Beispiel #1
0
    def parse(self, xml_detail):
        """Build a calibre Metadata object from the parsed detail document.

        Every field is extracted up front via the plugin's parse_* helpers;
        returns the populated Metadata instance, or None when either the
        title or the authors could not be found.
        """
        book_title = self.parse_title(xml_detail)
        book_authors = self.parse_authors(xml_detail)
        book_comments = self.parse_comments(xml_detail)
        book_rating = self.parse_rating(xml_detail)
        book_isbn = self.parse_isbn(xml_detail)
        book_publisher = self.parse_publisher(xml_detail)
        book_pubdate = self.parse_pubdate(xml_detail)
        book_tags = self.parse_tags(xml_detail)
        series_name, series_idx = self.parse_serie(xml_detail)
        cover_url = self.parse_cover(xml_detail)

        # Title and authors are mandatory; give up early without them.
        if book_title is None or book_authors is None:
            return None

        mi = Metadata(book_title, book_authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(book_comments)
        mi.identifiers = {self.plugin.name: str(self.number)}
        mi.rating = book_rating
        mi.tags = book_tags
        mi.publisher = book_publisher
        mi.pubdate = book_pubdate
        mi.isbn = book_isbn
        mi.series = series_name
        mi.series_index = series_idx
        mi.cover_url = cover_url

        if cover_url:
            self.plugin.cache_identifier_to_cover_url(str(self.number), cover_url)

        return mi
Beispiel #2
0
    def _metadata(self, book):
        """Translate a douban API book record (dict) into calibre Metadata.

        Author names are scrubbed with the REMOVES patterns; when none
        survive, an anonymous author placeholder is used.  The cover image
        is downloaded into memory when self.copy_image is set.
        """
        author_names = []
        if book['author']:
            for raw_name in book['author']:
                cleaned = raw_name
                for pattern in REMOVES:
                    cleaned = pattern.sub("", cleaned)
                author_names.append(cleaned)
        if not author_names:
            author_names = [u'佚名']

        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        mi = Metadata(book['title'])
        mi.authors = author_names
        mi.author_sort = mi.authors[0]
        mi.publisher = book['publisher']
        mi.comments = book['summary']
        mi.isbn = book.get('isbn13', None)
        # Keep at most eight tags.
        mi.tags = [t['name'] for t in book['tags']][:8]
        mi.rating = int(float(book['rating']['average']))
        mi.pubdate = self.str2date(book['pubdate'])
        mi.timestamp = datetime.datetime.now()
        mi.douban_id = book['id']
        mi.douban_author_intro = book['author_intro']
        mi.douban_subtitle = book.get('subtitle', None)
        mi.website = "https://book.douban.com/isbn/%s" % mi.isbn
        mi.source = u'豆瓣'
        mi.cover_url = book['images']['large']

        if self.copy_image:
            # Fetch the cover into memory; format guessed from the URL suffix.
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        logging.debug("=================\ndouban metadata:\n%s" % mi)
        return mi
Beispiel #3
0
    def parse(self, xml_detail):
        """Parse a pipe-delimited detail record into a Metadata object.

        The second line of ``xml_detail`` holds '|'-separated fields
        (author, title, ISBN, publisher, pub date, ...).  Returns a
        Metadata instance, or None when title or authors are missing.
        """
        data = xml_detail.split('\n')[1].split("|")
        self.log(data)

        title = data[1]
        authors = [data[0]]
        comments = data[13]
        isbn = data[3]
        publisher = data[6]

        # Publication date is stored as YYYY-MM-DD; tolerate a malformed or
        # missing value instead of crashing the whole parse.
        try:
            pub_date_tmp = data[34].split('-')
            pub_date = datetime.datetime(int(pub_date_tmp[0]),
                                         int(pub_date_tmp[1]),
                                         int(pub_date_tmp[2]),
                                         tzinfo=utc_tz)
        except (IndexError, ValueError):
            pub_date = None

        # BUGFIX: data[3] is always a string, so the previous
        # `isbn is not None` test could never fail and an empty ISBN
        # produced a bogus cover URL (".../covers/.jpg").
        if isbn:
            isbn_tmp = re.sub("-", "", isbn)
            cover = "%s/images/covers/%s.jpg"%(self.plugin.BASE_URL, isbn_tmp)
        else:
            cover = None

        if title is not None and authors is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.comments = as_unicode(comments)
            mi.identifiers = {self.plugin.name:self.ident}
            mi.publisher = publisher
            mi.pubdate = pub_date
            mi.isbn = isbn
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(self.ident, cover)

            return mi
        else:
            return None
Beispiel #4
0
    def parse(self, xml_detail, xml_more_info):
        """Collect metadata from the detail page plus the extra-info page.

        Returns a populated Metadata object, or None (with a log entry)
        when the mandatory title/authors could not be extracted.
        """
        found_title = self.parse_title(xml_detail)
        found_authors = self.parse_authors(xml_detail)
        found_comments = self.parse_comments(xml_detail)
        found_rating = self.parse_rating(xml_detail)
        found_isbn = self.parse_isbn(xml_more_info)
        found_publisher = self.parse_publisher(xml_detail)
        found_tags = self.parse_tags(xml_detail, xml_more_info)
        series_name, series_idx = self.parse_serie(xml_detail)
        year = self.parse_pub_year(xml_detail, xml_more_info)
        cover_url = self.parse_cover(xml_detail)

        # Without a title and authors the record is useless.
        if found_title is None or found_authors is None:
            self.log('Result skipped for because title or authors not found')
            return None

        mi = Metadata(as_unicode(found_title), found_authors)
        mi.languages = {'ces'}
        mi.comments = as_unicode(found_comments)
        mi.identifiers = {self.plugin.name: self.ident}
        mi.rating = found_rating
        mi.tags = found_tags
        mi.publisher = found_publisher
        mi.pubdate = year
        mi.isbn = found_isbn
        mi.series = series_name
        mi.series_index = series_idx
        mi.cover_url = cover_url

        if cover_url:
            self.plugin.cache_identifier_to_cover_url(self.ident, cover_url)

        return mi
Beispiel #5
0
    def _metadata(self, baike):
        """Build a calibre Metadata object from a Baidu Baike page wrapper.

        ``baike`` is expected to expose get_info()/get_tags()/get_image()/
        get_summary() and an ``http.url`` attribute -- TODO confirm against
        the Baike helper class.  Python 2 code (print statement, cStringIO).
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        # Debug dump of every scraped key/value pair.
        print "\n".join( "%s:\t%s" % v for v in info.items())

        mi = Metadata(info['title'])
        # Publishing platform: fall back through progressively more specific
        # info keys, then strip the '首发' ("first published") suffix.
        plat = "网络小说平台"
        plat = info.get(u'首发状态', plat)
        plat = info.get(u'首发网站', plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)
        mi.authors   = [ info.get(u'作者', u'佚名') ]
        mi.author_sort = mi.authors[0]
        mi.isbn      = BAIKE_ISBN
        mi.tags      = baike.get_tags()
        mi.pubdate   = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop the trailing citation marker (e.g. "[1]") from the summary.
        mi.comments  = re.sub(r'\[\d+\]$', "", baike.get_summary() )
        mi.website   = baike.http.url
        mi.source    = u'百度百科'

        if self.copy_image:
            # Fetch the cover into memory; format guessed from the URL suffix.
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        # If the serial is marked finished ('完结'), try to recover the real
        # completion date from the status text; best-effort only.
        if u'完结' in info.get(u'连载状态', ""):
            day = re.findall('\d*-\d*-\d*', info[u'连载状态'])
            try: mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except: pass
        return mi
Beispiel #6
0
    def parse(self, xml_detail):
        """Extract book metadata from a library catalogue detail table.

        Walks the rows of the ``record`` table, dispatching on the field
        tag in each row's first cell.  The tags look like MARC field
        numbers (245/246 title, 100/700 authors, 020 ISBN, 260 imprint,
        490 series, 655 genre) -- TODO confirm against the source site.
        Returns a Metadata object, or None when title, authors or the
        system identifier are missing.
        """
        sys_ident = title = isbn = publisher = pub_year = serie = serie_index = cover = None
        authors = []
        tags = []
        xpath = self.XPath('//table[@id="record"]//tr')
        for row in xpath(xml_detail):
            ch = row.getchildren()
            txt = ch[0].text.strip()
            data = self.normalize(ch[1].text)
            if txt.startswith('245') and title is None:
                title = self.parse_title(data)
            # NOTE(review): deliberately not 'elif' -- a 246 (variant title)
            # row always overwrites a title taken from 245; confirm intended.
            if txt.startswith('246'):
                title = self.parse_title(data)
            elif txt.startswith('100') or txt.startswith('700'):
                res = self.parse_author(data)
                if res is not None:
                    authors.append(res)
            elif txt == 'SYS':
                sys_ident = data.strip()
            elif txt =='020':
                isbn = self.parse_isbn(data)
            elif txt == '260':
                publisher, pub_year = self.parse_publisher(data)
            elif txt.startswith('490') and serie is None:
                serie, serie_index = self.parse_serie(data)
            elif txt == '655 7':
                # NOTE(review): if parse_tags returns a list this nests
                # lists inside 'tags' -- verify its return type.
                tags.append(self.parse_tags(data))

        # A cover URL can only be derived from a non-empty ISBN.
        if isbn is not None and isbn != '':
            cover = self.parse_cover(isbn)

        if title is not None and len(authors) > 0 and sys_ident is not None:
            mi = Metadata(title, authors)
            mi.languages = {'ces'}
            mi.identifiers = {self.plugin.name:sys_ident}
            mi.tags = tags
            mi.publisher = publisher
            mi.pubdate = pub_year
            mi.isbn = isbn
            mi.series = serie
            mi.series_index = serie_index
            mi.cover_url = cover

            if cover:
                self.plugin.cache_identifier_to_cover_url(sys_ident, cover)

            return mi
        else:
            self.log('Data not found')
            return None
Beispiel #7
0
    def _metadata(self, baike):
        """Assemble a calibre Metadata object from a Baike page wrapper."""
        from calibre.ebooks.metadata.book.base import Metadata

        info = baike.get_info()
        logging.debug("\n".join("%s:\t%s" % v for v in info.items()))

        mi = Metadata(info['title'])

        # Resolve the publishing platform by falling through successively
        # more specific info keys, then strip the '首发' suffix.
        plat = "网络小说平台"
        for key in (u'首发状态', u'首发网站'):
            plat = info.get(key, plat)
        plat = plat.replace(u'首发', '')
        mi.publisher = info.get(u'连载平台', plat)

        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop the trailing citation marker (e.g. "[1]") from the summary.
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'
        mi.provider_key = KEY
        mi.provider_value = baike.get_id()

        if self.copy_image and mi.cover_url:
            logging.debug("fetching cover: %s", mi.cover_url)
            cover_bytes = io.BytesIO(urlopen(mi.cover_url).read())
            cover_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (cover_fmt, cover_bytes)

        # Serial marked finished ('完结'): best-effort recovery of the real
        # completion date from the status text.
        if u'完结' in info.get(u'连载状态', ""):
            day = re.findall('\d*-\d*-\d*', info[u'连载状态'])
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except:
                pass
        return mi
Beispiel #8
0
    def _metadata(self, baike):
        """Build a calibre Metadata object from a Baike page wrapper
        (traditional-Chinese variant).  Python 2 code (print, cStringIO).
        """
        from calibre.ebooks.metadata.book.base import Metadata
        from cStringIO import StringIO

        info = baike.get_info()
        # Debug dump of every scraped key/value pair.
        print "\n".join("%s:\t%s" % v for v in info.items())

        mi = Metadata(info['title'])
        # Publishing platform: fall back through progressively more specific
        # info keys, then strip the '首發' ("first published") suffix.
        plat = "網絡小說平台"
        plat = info.get(u'首發狀態', plat)
        plat = info.get(u'首發網站', plat)
        plat = plat.replace(u'首發', '')
        mi.publisher = info.get(u'連載平台', plat)
        mi.authors = [info.get(u'作者', u'佚名')]
        mi.author_sort = mi.authors[0]
        mi.isbn = BAIKE_ISBN
        mi.tags = baike.get_tags()
        mi.pubdate = datetime.datetime.now()
        mi.timestamp = datetime.datetime.now()
        mi.cover_url = baike.get_image()
        # Drop the trailing citation marker (e.g. "[1]") from the summary.
        mi.comments = re.sub(r'\[\d+\]$', "", baike.get_summary())
        mi.website = baike.http.url
        mi.source = u'百度百科'

        if self.copy_image:
            # Fetch the cover into memory; format guessed from the URL suffix.
            img = StringIO(urlopen(mi.cover_url).read())
            img_fmt = mi.cover_url.split(".")[-1]
            mi.cover_data = (img_fmt, img)

        # Serial marked finished ('完結'): best-effort recovery of the real
        # completion date from the status text.
        if u'完結' in info.get(u'連載狀態', ""):
            day = re.findall('\d*-\d*-\d*', info[u'連載狀態'])
            try:
                mi.pubdate = datetime.datetime.strptime(day[0], '%Y-%m-%d')
            except:
                pass
        return mi
Beispiel #9
0
#!/usr/bin/env python
Beispiel #10
0
    def load_details(self, url, timeout):
        """Fetch a ridibooks detail page and enqueue its Metadata.

        Reads the og:/books: <meta> tags (book ID, title, ISBN, cover,
        rating) and the schema.org Book JSON-LD block (authors,
        description, publisher, publication date), then pushes a populated
        Metadata object onto self.result_queue.
        """
        def _format_item(str):
            # Strip surrounding double quotes and unescape HTML entities.
            return re.sub('^"(.*)"$', '\\1', unescape(str))

        def _format_list(str):
            # Comma-separated meta value -> list of stripped items.
            return [_.strip() for _ in _format_item(str).split(',')]

        def _find_meta(node, property):
            # Content of the first <meta> whose @property matches.
            return [
                _.get('content') for _ in node if _.get('property') == property
            ][0]

        def _format_date(date_text):
            # 'YYYYMMDD' -> timezone-aware datetime (UTC).
            year = int(date_text[0:4])
            month = int(date_text[4:6])
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # Fields sourced from the <meta> tags:
            # book ID, title, ISBN, image URL, rating.
            meta = root.xpath(
                '//meta[starts-with(@property, "og") or starts-with(@property, "books")]'
            )

            # Fields sourced from the schema.org JSON-LD:
            # title, authors, description, publisher.
            ld_json = root.xpath(
                '//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)
            # BUGFIX: bail out on failure; previously execution fell through
            # and crashed below with a NameError on 'meta'/'book_info'.
            return

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        # 'in' instead of the Python-2-only dict.has_key().
        if 'translator' in book_info:
            authors.extend([
                _ + u'(역자)'
                for _ in _format_list(book_info['translator']['name'])
            ])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        # Series name and index are encoded in the title, e.g. '... 3권'.
        series = re.search(u'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(
                    ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    def load_details(self, url, timeout):
        """Fetch a ridibooks detail page and enqueue its Metadata.

        Reads the og:/books: <meta> tags (book ID, title, ISBN, cover,
        rating) and the schema.org Book JSON-LD block (authors,
        description, publisher, publication date), then pushes a populated
        Metadata object onto self.result_queue.
        """
        def _format_item(str):
            # Strip surrounding double quotes and unescape HTML entities.
            return re.sub('^"(.*)"$', '\\1', unescape(str))

        def _format_list(str):
            # Comma-separated meta value -> list of stripped items.
            return [_.strip() for _ in _format_item(str).split(',')]

        def _find_meta(node, property):
            # Content of the first <meta> whose @property matches.
            return [_.get('content') for _ in node if _.get('property') == property][0]

        def _format_date(date_text):
            # 'YYYYMMDD' -> timezone-aware datetime (UTC).
            year = int(date_text[0:4])
            month = int(date_text[4:6])
            day = int(date_text[6:])
            return datetime.datetime(year, month, day, tzinfo=utc_tz)

        try:
            response = self.browser.open(url, timeout=timeout)
            root = lxml.html.fromstring(response.read())

            # Fields sourced from the <meta> tags:
            # book ID, title, ISBN, image URL, rating.
            meta = root.xpath('//meta[starts-with(@property, "og") or starts-with(@property, "books")]')

            # Fields sourced from the schema.org JSON-LD:
            # title, authors, description, publisher.
            ld_json = root.xpath('//script[@type="application/ld+json"]/text()')
            ld = [json.loads(_) for _ in ld_json]
            book_info = [_ for _ in ld if _['@type'] == 'Book'][0]
        except Exception as e:
            self.log.exception(e)
            # BUGFIX: bail out on failure; previously execution fell through
            # and crashed below with a NameError on 'meta'/'book_info'.
            return

        ridibooks_id = re.search('id=([0-9]+)', url).group(1)
        isbn = _find_meta(meta, 'books:isbn')
        cover_url = _find_meta(meta, 'og:image')

        title = _find_meta(meta, 'og:title')
        authors = _format_list(book_info['author']['name'])
        # 'in' instead of the Python-2-only dict.has_key().
        if 'translator' in book_info:
            authors.extend([_ + u'(역자)' for _ in _format_list(book_info['translator']['name'])])

        mi = Metadata(title, authors)
        mi.set_identifier('ridibooks', ridibooks_id)

        mi.cover_url = cover_url
        mi.has_cover = bool(cover_url)

        mi.publisher = _format_item(book_info['publisher']['name'])
        mi.pubdate = _format_date(book_info['datePublished'])

        mi.comments = _format_item(book_info['description'])
        mi.rating = float(_find_meta(meta, 'books:rating:normalized_value'))

        # Series name and index are encoded in the title, e.g. '... 3권'.
        series = re.search(u'(.*)\s*(\d+)권', title)
        if series:
            mi.series = series.group(1)
            mi.series_index = float(series.group(2))

        mi.language = 'Korean'
        mi.source_relevance = self.relevance

        if ridibooks_id:
            if isbn:
                self.plugin.cache_isbn_to_identifier(isbn, ridibooks_id)
            if cover_url:
                self.plugin.cache_identifier_to_cover_url(ridibooks_id, cover_url)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    def parse_details(self, root):
        """Scrape an Aladin book page (parsed lxml ``root``) and enqueue the
        resulting Metadata on self.result_queue.

        Title, authors and an ISBN extracted from self.url are mandatory;
        every other field is best-effort: failures are logged and the
        field is simply left unset.
        """
        # The URL itself carries an ISBN that doubles as the Aladin id.
        try:
            isbn = self.extract_isbn(self.url)
        except:
            self.log.exception('No ISBN in URL: %r'%self.url)
            isbn = None

        try:
            (title, series, series_index) = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)
            title = series = series_index = None

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        # Without these three the record cannot be identified -- give up.
        if not title or not authors or not isbn:
            self.log.error('Could not find title/authors/Aladin id for %r'%self.url)
            self.log.error('Aladin: %r Title: %r Authors: %r'%(isbn, title, authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        #mi.set_identifier('isbn', isbn)
        mi.isbn = isbn
        self.isbn = isbn

        # ISBN-13: prefer the value parsed from the page when available.
        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!
        if mi.has_cover:
            self.log.info('Cover URL: '+mi.cover_url)

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'

        mi.source_relevance = self.relevance

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #13
0
    def get_details(self):
        '''
        Strip the mofibo book page for all available information and put
        the assembled Metadata on self.result_queue.

        Fatal failures (network error, 404, unparsable HTML) abort early;
        individual field failures are logged and the field left unset.
        '''
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Parse the html code from the website
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        # Do some error handling if it fails to read data
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for saxo timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # Do some error handling if the html code returned 404
        # BUGFIX: use substring containment -- the previous '== raw'
        # compared the 404 marker against the *entire* page, so the check
        # could never trigger.
        if "<title>404 - " in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # Clean the html data a little
        try:
            root = parse(raw)
        except:
            self.log.error("Error cleaning HTML")
            return

        # Get the title of the book
        try:
            title_node = root.xpath('//span[@itemprop="name"]')
            self.title = title_node[0].text
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)

        # Get the author(s): a single span holds comma-separated names.
        try:
            author_node = root.xpath('//span[@class="expandAuthorName"]')
            author_strings = author_node[0].text.split(",")
            for name in author_strings:
                self.authors.append(name)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        # Get the series of the book; the link text is "<series>: <index>".
        try:
            series_node = root.xpath('//b[contains(text(), "Serie")]/a')
            if len(series_node) > 0:
                self.series = series_node[0].text.split(": ")[0].strip()
                self.series_index = series_node[0].text.split(": ")[-1].strip()
        except:
            self.log.exception('Error parsing series for url: %r' % self.url)

        # Ratings are not scraped yet; a plain assignment cannot fail, so
        # the old try/except around it was dead code.
        self.rating = 0.0

        # Get the ISBN number from the site
        try:
            isbn_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span[@itemprop="identifier"]'
            )
            if len(isbn_node) > 0:
                self.isbn = isbn_node[0].text.replace("ISBN: ", "").strip()
        except:
            self.log.exception('Error parsing isbn for url: %r' % self.url)
            self.isbn = None

        # Get the comments/blurb for the book
        try:
            comment_node = root.xpath('//meta[@name="description"]/@content')
            self.comments = comment_node[0]
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
            self.comments = None

        # Parse the cover url for downloading the cover.
        try:
            cover_node = root.xpath(
                '//div[@class="bookDetailCoverCover"]/img/@src')
            self.cover_url = "https://mofibo.com" + cover_node[0]
            self.log.info('    Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.isbn,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
        # BUGFIX: record the cover outcome on both paths -- previously
        # has_cover was only assigned inside the except handler.
        self.has_cover = bool(getattr(self, 'cover_url', None))

        # Get the publisher name
        try:
            publisher_node = root.xpath(
                '//div[@class="eBookContainer"]/b/span/a[@itemprop="brand"]')
            if len(publisher_node) > 0:
                self.publisher = publisher_node[0].text
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Get the language of the book. Only english and danish are supported tho
        try:
            language_node = root.xpath('//b[@class="expanderLanguage"]')
            language = language_node[0].text.strip().replace("Sprog:",
                                                             "").replace(
                                                                 " ", "")
            language = self.lang_map.get(language, None)
            self.language = language
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        # Get the publication date ("Udgivet: YYYY-MM-DD")
        try:
            pubdate_node = root.xpath(
                '//div[@class="eBookContainer"]/b[contains(text(),"Udgivet:")]'
            )
            if len(pubdate_node) > 0:
                date_str = pubdate_node[0].text.replace("Udgivet:", "").strip()
                format_str = '%Y-%m-%d'  # The format
                self.pubdate = datetime.datetime.strptime(date_str, format_str)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        # Get the tags
        try:
            tags = []
            tags_node = root.xpath('//span[@itemprop="category"]')
            tags.append(tags_node[0].text.strip())
            self.tags = tags
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        # Setup the metadata
        meta_data = Metadata(self.title, self.authors)
        meta_data.set_identifier('isbn', self.isbn)
        meta_data.set_identifier('mofibo', self.url)

        # Set series
        if self.series:
            try:
                meta_data.series = self.series
                meta_data.series_index = self.series_index
            except:
                self.log.exception('Error loading series')
        # Set ISBN
        if self.isbn:
            try:
                meta_data.isbn = self.isbn
            except:
                self.log.exception('Error loading ISBN')
        # Set relevance
        if self.relevance:
            try:
                meta_data.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        # Set cover url
        if self.cover_url:
            try:
                meta_data.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        # Set publisher
        if self.publisher:
            try:
                meta_data.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        # Set language
        if self.language:
            try:
                meta_data.language = self.language
            except:
                self.log.exception('Error loading language')
        # Set comments/blurb
        if self.comments:
            try:
                meta_data.comments = self.comments
            except:
                self.log.exception("Error loading comments")
        # Set publication date
        if self.pubdate:
            try:
                meta_data.pubdate = self.pubdate
            except:
                self.log.exception('Error loading pubdate')
        # Set tags data
        if self.tags:
            try:
                meta_data.tags = self.tags
            except:
                self.log.exception('Error loading tags')

        # Put meta data
        self.plugin.clean_downloaded_metadata(meta_data)
        self.result_queue.put(meta_data)
Beispiel #14
0
    def get_details(self):
        """Download a biblionet detail record (JSON) and enqueue Metadata.

        Fatal failures (network error, 404, unparsable JSON) abort early;
        individual field failures are logged and the field left unset.
        """
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Fetch the raw response body.
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
            self.log.info(raw)
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for biblionet timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        if '<title>404 - ' in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # The endpoint returns JSON, not HTML.
        try:
            # root = fromstring(clean_ascii_chars(raw))
            root = json.loads(raw)
            self.log.info(root)
        except:
            msg = 'Failed to parse book detail page: %r' % self.url
            self.log.exception(msg)
            return

        try:
            self.biblionetid = root['biblionetid']
        except:
            self.log.exception('Error parsing book id for url: %r' % self.url)
            self.biblionetid = None

        try:
            self.title = root['title'].strip()
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)
            self.title = None
            # NOTE(review): series_index is only reset here, on title
            # failure -- confirm this is intended.
            self.series_index = None

        # The API appears to return authors as a single string.
        try:
            self.authors = [root['authors'].strip()]
            self.log.info(self.authors)
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        try:
            self.cover_url = root['cover_url']
            self.log.info('Parsed URL for cover:%r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.biblionetid,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
            # NOTE(review): has_cover is only set on the failure path, and
            # reads cover_url which may not have been assigned -- verify.
            self.has_cover = bool(self.cover_url)

        try:
            self.publisher = root['publisher']
            self.log.info('Parsed publisher:%s' % self.publisher)
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Categories come as one string; normalise DDC markers, strip
        # dashes, split into words and drop the last token.
        try:
            self.tags = root['categories'].replace('DDC: ', 'DDC:').replace(
                '-', '').split()[:-1]
            self.log.info('Parsed tags:%s' % self.tags)
        except:
            self.log.exception('Error parsing tags for url: %r' % self.url)

        try:
            self.pubdate = root['yr_published']
            self.log.info('Parsed publication date:%s' % self.pubdate)
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        mi = Metadata(self.title, self.authors)
        mi.set_identifier('biblionet', self.biblionetid)

        if self.series_index:
            try:
                mi.series_index = float(self.series_index)
            except:
                self.log.exception('Error loading series')
        if self.relevance:
            try:
                mi.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        if self.cover_url:
            try:
                mi.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        if self.publisher:
            try:
                mi.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        if self.tags:
            try:
                mi.tags = self.tags
            except:
                self.log.exception('Error loading tags')
        if self.pubdate:
            try:
                # yr_msg1/yr_msg2 are presumably "year unknown" placeholder
                # strings from the API -- confirm against the plugin class.
                if self.pubdate not in (self.yr_msg1, self.yr_msg2):
                    d = datetime.date(int(self.pubdate), 1, 1)
                    mi.pubdate = d
            except:
                self.log.exception('Error loading pubdate')

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
Beispiel #15
0
	def parse_details(self, root):
		"""Parse an ISFDB publication page into a calibre Metadata object.

		Walks the <li> detail rows of the publication header, extracting
		title, authors/editors, ISBN, publisher and date. Aborts (with an
		error log) when title, authors or the ISFDB record id are missing;
		otherwise the populated Metadata is pushed onto self.result_queue.
		"""
		isfdb_id = None
		title = None
		authors = []
		isbn = None
		publisher = None
		pubdate = None

		try:
			# The ISFDB record number is the trailing run of digits in the URL.
			# (raw string + group(1) instead of the old '(\d+)$' / groups(0)[0])
			isfdb_id = re.search(r'(\d+)$', self.url).group(1)
		except:
			self.log.exception('Error parsing ISFDB ID for url: %r' % self.url)

		detail_nodes = root.xpath('//div[@id="content"]//td[@class="pubheader"]/ul/li')
		if not detail_nodes:
			detail_nodes = root.xpath('//div[@id="content"]/div/ul/li') # no table (on records with no image)

		for detail_node in detail_nodes:
			# Each row looks like "<b>Section:</b> value ..." — the section
			# label is the first child, the value is its tail text.
			section = detail_node[0].text_content().strip().rstrip(':')
			try:
				if section == 'Publication':
					title = detail_node[0].tail.strip()
					if not title:
						# assume an extra span with a transliterated title tooltip
						title = detail_node[1].text_content().strip()
				elif section == 'Authors' or section == 'Editors':
					for a in detail_node.xpath('.//a'):
						author = a.text_content().strip()
						if section.startswith('Editors'):
							authors.append(author + ' (Editor)')
						else:
							authors.append(author)
				elif section == 'ISBN':
					isbn = detail_node[0].tail.strip('[] \n')
				elif section == 'Publisher':
					publisher = detail_node.xpath('a')[0].text_content().strip()
				elif section == 'Date':
					pubdate = self._convert_date_text(detail_node[0].tail.strip())
			except:
				self.log.exception('Error parsing section %r for url: %r' % (section, self.url) )

		# Without these three essentials there is nothing useful to report.
		if not title or not authors or not isfdb_id:
			self.log.error('Could not find title/authors/ISFDB ID for %r' % self.url)
			self.log.error('ISFDB: %r Title: %r Authors: %r' % (isfdb_id, title,
				authors))
			return

		mi = Metadata(title, authors)
		mi.set_identifier('isfdb', isfdb_id)
		self.isfdb_id = isfdb_id

		if isbn:
			self.isbn = mi.isbn = isbn
		if publisher:
			mi.publisher = publisher
		if pubdate:
			mi.pubdate = pubdate

		try:
			mi.comments = self.parse_comments(root)
		except:
			self.log.exception('Error parsing comments for url: %r'%self.url)

		try:
			self.cover_url = self.parse_cover(root)
		except:
			self.log.exception('Error parsing cover for url: %r'%self.url)

		mi.has_cover = bool(self.cover_url)
		mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

		mi.source_relevance = self.relevance

		if self.isfdb_id and self.isbn:
			self.plugin.cache_isbn_to_identifier(self.isbn, self.isfdb_id)

		self.plugin.clean_downloaded_metadata(mi)
		self.result_queue.put(mi)
    def parse_details(self, root):
        """Parse one YES24 detail page into a calibre Metadata object.

        Aborts (with an error log) when no title, authors or YES24 id can
        be found; otherwise the populated Metadata object is pushed onto
        self.result_queue.
        """
        book_id = None
        try:
            book_id = self.parse_yes24_id(self.url)
        except:
            self.log.exception('Error parsing YES24 id for url: %r'%self.url)

        title = series = series_index = None
        try:
            title, series, series_index = self.parse_title_series(root)
        except:
            self.log.exception('Error parsing title and series for url: %r'%self.url)

        try:
            authors = self.parse_authors(root)
        except:
            self.log.exception('Error parsing authors for url: %r'%self.url)
            authors = []

        # Without these three essentials there is nothing useful to report.
        if not (title and authors and book_id):
            self.log.error('Could not find title/authors/YES24 id for %r'%self.url)
            self.log.error('YES24: %r Title: %r Authors: %r'%(book_id, title,
                authors))
            return

        mi = Metadata(title, authors)
        if series:
            mi.series = series
            mi.series_index = series_index
        mi.set_identifier('yes24', book_id)
        self.yes24_id = book_id

        try:
            isbn = self.parse_isbn(root)
            if isbn:
                self.isbn = mi.isbn = isbn
        except:
            self.log.exception('Error parsing ISBN for url: %r'%self.url)

        try:
            mi.comments = self.parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r'%self.url)

        try:
            self.cover_url = self.parse_cover(root)
        except:
            self.log.exception('Error parsing cover for url: %r'%self.url)
        mi.has_cover = bool(self.cover_url)
        mi.cover_url = self.cover_url # This is purely so we can run a test for it!!!

        try:
            mi.publisher = self.parse_publisher(root)
        except:
            self.log.exception('Error parsing publisher for url: %r'%self.url)

        try:
            mi.pubdate = self.parse_published_date(root)
        except:
            self.log.exception('Error parsing published date for url: %r'%self.url)

        mi.language = 'ko'
        mi.source_relevance = self.relevance

        if self.yes24_id and self.isbn:
            self.plugin.cache_isbn_to_identifier(self.isbn, self.yes24_id)

        self.plugin.clean_downloaded_metadata(mi)
        self.result_queue.put(mi)
    def get_details(self):
        '''
        The get_details() function for stripping the website for all information
        '''
        self.log.info("    Worker.get_details:")
        self.log.info("        self:     ", self)
        self.log.info("        self.url: ", self.url)

        # Fetch the raw HTML for the detail page.
        try:
            raw = self.browser.open_novisit(
                self.url, timeout=self.timeout).read().strip()
        # Do some error handling if it fails to read data
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
                self.log.error('URL malformed: %r' % self.url)
                return
            attr = getattr(e, 'args', [None])
            attr = attr if attr else [None]
            if isinstance(attr[0], socket.timeout):
                msg = 'Bookmeta for saxo timed out. Try again later.'
                self.log.error(msg)
            else:
                msg = 'Failed to make details query: %r' % self.url
                self.log.exception(msg)
            return

        # Some 404s come back as an ordinary page whose <title> starts with
        # "404 - ". The old test compared the marker for *equality* against
        # the whole page (and str against bytes), so it could never match;
        # check for the marker as a substring of the raw (bytes) response.
        if b"<title>404 - " in raw:
            self.log.error('URL malformed: %r' % self.url)
            return

        # Clean/parse the HTML into an element tree.
        try:
            root = parse(raw)
        except:
            self.log.error("Error cleaning HTML")
            return

        # Get the json data within the HTML code (some stuff is easier to get with json)
        try:
            json_raw = root.xpath('(//script[@type="application/ld+json"])[2]')
            json_root = json.loads(json_raw[0].text.strip())
        except:
            self.log.error("Error loading JSON data")
            return

        # Get the title of the book
        try:
            self.title = json_root['name']
        except:
            self.log.exception('Error parsing title for url: %r' % self.url)

        # Get the author(s) of the book
        try:
            author_node = root.xpath(
                '//h2[@class="product-page-heading__autor"]//a')
            for name in author_node:
                self.authors.append(name.text.strip())
        except:
            self.log.exception('Error parsing authors for url: %r' % self.url)
            self.authors = None

        # Some books have ratings, let's use them.
        try:
            self.rating = float(json_root['aggregateRating']['ratingValue'])
        except:
            self.log.exception('Error parsing rating for url: %r' % self.url)
            self.rating = 0.0

        # Get the ISBN number from the site
        try:
            self.isbn = json_root['isbn']
        except:
            self.log.exception('Error parsing isbn for url: %r' % self.url)
            self.isbn = None

        # Get the comments/blurb for the book
        try:
            self.comments = parse_comments(root)
        except:
            self.log.exception('Error parsing comments for url: %r' % self.url)
            self.comments = None

        # Parse the cover url for downloading the cover.
        try:
            self.cover_url = json_root['image']
            self.log.info('    Parsed URL for cover: %r' % self.cover_url)
            self.plugin.cache_identifier_to_cover_url(self.isbn,
                                                      self.cover_url)
        except:
            self.log.exception('Error parsing cover for url: %r' % self.url)
            self.cover_url = None
        # Record whether a cover was found. Previously this was assigned only
        # inside the except branch above, where self.cover_url might not even
        # exist yet; compute it on both paths instead.
        self.has_cover = bool(self.cover_url)

        # Get the publisher name
        try:
            self.publisher = json_root['publisher']['name']
        except:
            self.log.exception('Error parsing publisher for url: %r' %
                               self.url)

        # Get the language of the book; only languages present in
        # self.lang_map are recognised, anything else maps to None.
        try:
            language = json_root['inLanguage']['name']
            self.language = self.lang_map.get(language, None)
        except:
            self.log.exception('Error parsing language for url: %r' % self.url)

        # Get the publication date (rendered as dd-mm-yyyy on the page).
        try:
            pubdate_node = root.xpath(
                '//div[@class="product-page-block__container"]//dd'
            )  # Format dd-mm-yyyy
            date_str = pubdate_node[0].text.strip()
            self.pubdate = datetime.datetime.strptime(date_str, '%d-%m-%Y')
        except:
            self.log.exception('Error parsing published date for url: %r' %
                               self.url)

        # Assemble the Metadata object from everything collected above.
        meta_data = Metadata(self.title, self.authors)
        meta_data.set_identifier('isbn', self.isbn)
        meta_data.set_identifier('saxo', self.url)

        # Set rating
        if self.rating:
            try:
                meta_data.rating = self.rating
            except:
                self.log.exception('Error loading rating')
        # Set ISBN
        if self.isbn:
            try:
                meta_data.isbn = self.isbn
            except:
                self.log.exception('Error loading ISBN')
        # Set relevance
        if self.relevance:
            try:
                meta_data.source_relevance = self.relevance
            except:
                self.log.exception('Error loading relevance')
        # Set cover url
        if self.cover_url:
            try:
                meta_data.cover_url = self.cover_url
            except:
                self.log.exception('Error loading cover_url')
        # Set publisher
        if self.publisher:
            try:
                meta_data.publisher = self.publisher
            except:
                self.log.exception('Error loading publisher')
        # Set language
        if self.language:
            try:
                meta_data.language = self.language
            except:
                self.log.exception('Error loading language')
        # Set comments/blurb
        if self.comments:
            try:
                meta_data.comments = self.comments
            except:
                self.log.exception("Error loading comments")
        # Set publication date
        if self.pubdate:
            try:
                meta_data.pubdate = self.pubdate
            except:
                self.log.exception('Error loading pubdate')

        # Hand the finished record back to calibre.
        self.plugin.clean_downloaded_metadata(meta_data)
        self.result_queue.put(meta_data)