Python html2text 예제들, calibre.utils.html2text.html2text Python 예제들

예제 #1

0

파일 보기

파일: comments.py 프로젝트: GRiker/calibre

def sanitize_comments_html(html):
    """Round-trip *html* through html2text and Markdown and flatten it.

    The markup is first turned into text by ``html2text``, then rendered
    back by a ``Markdown`` converter created with ``safe_mode="remove"``.
    Every run of newline characters is removed from the rendered result
    before it is returned.
    """
    from calibre.ebooks.markdown import Markdown

    as_text = html2text(html)
    renderer = Markdown(safe_mode="remove")
    rendered = renderer.convert(as_text)
    # Collapse the rendered markup onto a single line.
    return re.sub("\n+", "", rendered)

예제 #2

0

파일 보기

    def convert_calibre_md_to_comic_md(self):
        '''
        Maps the entries in the calibre metadata to comictagger metadata.

        The result is stored in self.calibre_md_in_comic_format.  If that
        attribute is already truthy the method returns immediately, so the
        conversion is only performed once.  Standard fields are read from
        self.calibre_metadata; the remaining fields are read from the custom
        columns named in prefs via self.db.field_for for self.book_id.
        '''
        from calibre.utils.html2text import html2text
        from calibre.utils.date import UNDEFINED_DATE
        from calibre.utils.localization import lang_as_iso639_1

        if self.calibre_md_in_comic_format:
            return

        self.calibre_md_in_comic_format = GenericMetadata()
        mi = self.calibre_metadata

        # shorten some functions
        role = partial(set_role,
                       credits=self.calibre_md_in_comic_format.credits)
        update_field = partial(update_comic_field,
                               target=self.calibre_md_in_comic_format)

        # update the fields of comic metadata
        update_field("title", mi.title)
        role("Writer", mi.authors)
        update_field("series", mi.series)
        update_field("issue", mi.series_index)
        update_field("tags", mi.tags)
        update_field("publisher", mi.publisher)
        update_field("criticalRating", mi.rating)
        # need to check for None
        if mi.comments:
            update_field("comments", html2text(mi.comments))
        if mi.language:
            update_field("language", lang_as_iso639_1(mi.language))
        # A missing pubdate (None) also compares unequal to UNDEFINED_DATE,
        # and reading .year from None would raise AttributeError, so both
        # cases are excluded here.
        pubdate = mi.pubdate
        if pubdate is not None and pubdate != UNDEFINED_DATE:
            update_field("year", pubdate.year)
            update_field("month", pubdate.month)
            update_field("day", pubdate.day)

        # custom columns
        field = partial(self.db.field_for, book_id=self.book_id)

        # artists: (credit role, prefs key holding the custom column name)
        artist_columns = (
            ("Penciller", 'penciller_column'),
            ("Inker", 'inker_column'),
            ("Colorist", 'colorist_column'),
            ("Letterer", 'letterer_column'),
            ("CoverArtist", 'cover_artist_column'),
            ("Editor", 'editor_column'),
        )
        for credit_role, pref_key in artist_columns:
            role(credit_role, field(prefs[pref_key]))

        # others: (comic metadata field, prefs key holding the column name)
        other_columns = (
            ("storyArc", 'storyarc_column'),
            ("characters", 'characters_column'),
            ("teams", 'teams_column'),
            ("locations", 'locations_column'),
            ("volume", 'volume_column'),
            ("genre", 'genre_column'),
            ("issueCount", 'count_column'),
            ("pageCount", 'pages_column'),
        )
        for comic_field, pref_key in other_columns:
            update_field(comic_field, field(prefs[pref_key]))

        # the comicvine column value is passed through get_link first
        update_field("webLink", get_link(field(prefs['comicvine_column'])))
        update_field("manga", field(prefs['manga_column']))

예제 #3

0

파일 보기

파일: comments.py 프로젝트: jonesgithub/calibre

def sanitize_comments_html(html):
    """Return *html* re-rendered via Markdown, cleaned by bleach, on one line.

    The input is converted to text with ``html2text``, rendered with a
    default ``Markdown`` converter, passed through ``bleach.clean`` and
    finally stripped of every run of newline characters.
    """
    from calibre.ebooks.markdown import Markdown
    import bleach
    markdown_source = html2text(html)
    rendered = Markdown().convert(markdown_source)
    scrubbed = bleach.clean(rendered)
    # Remove all line breaks from the cleaned markup.
    return re.sub(u'\n+', u'', scrubbed)

예제 #4

0

파일 보기

파일: comments.py 프로젝트: kerasking/calibre

def sanitize_comments_html(html):
    """Sanitize comment markup and return it without any newlines.

    Steps, in order: ``html2text`` on the input, rendering through a
    default ``Markdown`` instance, ``bleach.clean`` on the rendered
    markup, then removal of every run of newline characters.
    """
    from calibre.ebooks.markdown import Markdown
    import bleach
    plain = html2text(html)
    converter = Markdown()
    cleaned = bleach.clean(converter.convert(plain))
    newline_runs = u'\n+'
    return re.sub(newline_runs, u'', cleaned)

예제 #5

0

파일 보기

파일: utils.py 프로젝트: JimmXinu/calibre

 def chapter_head(self, match):
     '''
     Build the heading markup for one detected chapter.

     match must expose the named groups 'chap' and 'title'.  Every call
     increments self.html_preprocess_sections and writes a debug log line.
     Without a title the chapter is returned inside an <h2>.  With a title
     the <h2> additionally carries a plain-text title attribute and the
     title itself follows in an <h3 class="sigilNotInTOC">.
     '''
     from calibre.utils.html2text import html2text
     chap = match.group('chap')
     title = match.group('title')
     if not title:
         self.html_preprocess_sections = self.html_preprocess_sections + 1
         self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
                 " chapters. - " + unicode_type(chap))
         return '<h2>'+chap+'</h2>\n'

     delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
     # Character class so that every single or double quote is removed.
     # The previous pattern only matched the two-character sequence '" and
     # let a lone double quote through, which would end the title="..."
     # attribute built below prematurely.
     delete_quotes = re.compile('[\'"]')
     txt_chap = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(chap)))
     txt_title = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(title)))
     self.html_preprocess_sections = self.html_preprocess_sections + 1
     self.log.debug("marked " + unicode_type(self.html_preprocess_sections) +
             " chapters & titles. - " + unicode_type(chap) + ", " + unicode_type(title))
     return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'

예제 #6

0

파일 보기

 def chapter_head(self, match):
     '''
     Build the heading markup for one detected chapter.

     match must expose the named groups 'chap' and 'title'.  Every call
     increments self.html_preprocess_sections and writes a debug log line.
     Without a title the chapter is returned inside an <h2>.  With a title
     the <h2> additionally carries a plain-text title attribute and the
     title itself follows in an <h3 class="sigilNotInTOC">.
     '''
     from calibre.utils.html2text import html2text
     chap = match.group('chap')
     title = match.group('title')
     if not title:
         self.html_preprocess_sections = self.html_preprocess_sections + 1
         self.log.debug("marked " + unicode(self.html_preprocess_sections) +
                 " chapters. - " + unicode(chap))
         return '<h2>'+chap+'</h2>\n'

     # Raw strings: \s and \g are regex escapes, not string escapes.
     delete_whitespace = re.compile(r'^\s*(?P<c>.*?)\s*$')
     # Character class so that every single or double quote is removed.
     # The previous pattern only matched the two-character sequence '" and
     # let a lone double quote through, which would end the title="..."
     # attribute built below prematurely.
     delete_quotes = re.compile('[\'"]')
     txt_chap = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(chap)))
     txt_title = delete_quotes.sub('', delete_whitespace.sub(r'\g<c>', html2text(title)))
     self.html_preprocess_sections = self.html_preprocess_sections + 1
     self.log.debug("marked " + unicode(self.html_preprocess_sections) +
             " chapters & titles. - " + unicode(chap) + ", " + unicode(title))
     return '<h2 title="'+txt_chap+', '+txt_title+'">'+chap+'</h2>\n<h3 class="sigilNotInTOC">'+title+'</h3>\n'

예제 #7

0

파일 보기

파일: comicmetadata.py 프로젝트: dickloraine/EmbedComicMetadata

    def convert_calibre_md_to_comic_md(self):
        '''
        Maps the entries in the calibre metadata to comictagger metadata.

        The converted metadata is kept in self.calibre_md_in_comic_format;
        when that attribute is already truthy nothing is done.  Standard
        fields come from self.calibre_metadata, the rest from the custom
        columns named in prefs, looked up with self.db.field_for for
        self.book_id.
        '''
        from calibre.utils.html2text import html2text
        from calibre.utils.date import UNDEFINED_DATE
        from calibre.utils.localization import lang_as_iso639_1

        if self.calibre_md_in_comic_format:
            return

        self.calibre_md_in_comic_format = GenericMetadata()
        mi = self.calibre_metadata

        # shorten some functions
        role = partial(set_role, credits=self.calibre_md_in_comic_format.credits)
        update_field = partial(update_comic_field, target=self.calibre_md_in_comic_format)

        # update the fields of comic metadata
        update_field("title", mi.title)
        role("Writer", mi.authors)
        update_field("series", mi.series)
        update_field("issue", mi.series_index)
        update_field("tags", mi.tags)
        update_field("publisher", mi.publisher)
        update_field("criticalRating", mi.rating)
        # need to check for None
        if mi.comments:
            update_field("comments", html2text(mi.comments))
        if mi.language:
            update_field("language", lang_as_iso639_1(mi.language))
        # None is not equal to UNDEFINED_DATE either, and None has no
        # year/month/day attributes, so a missing date is skipped explicitly.
        published = mi.pubdate
        if published is not None and published != UNDEFINED_DATE:
            for part in ("year", "month", "day"):
                update_field(part, getattr(published, part))

        # custom columns
        field = partial(self.db.field_for, book_id=self.book_id)

        # artists: credit role -> prefs key holding the custom column name
        for credit_role, pref_key in (
                ("Penciller", 'penciller_column'),
                ("Inker", 'inker_column'),
                ("Colorist", 'colorist_column'),
                ("Letterer", 'letterer_column'),
                ("CoverArtist", 'cover_artist_column'),
                ("Editor", 'editor_column')):
            role(credit_role, field(prefs[pref_key]))

        # others: comic metadata field -> prefs key holding the column name
        for comic_field, pref_key in (
                ("storyArc", 'storyarc_column'),
                ("characters", 'characters_column'),
                ("teams", 'teams_column'),
                ("locations", 'locations_column'),
                ("volume", 'volume_column'),
                ("genre", 'genre_column')):
            update_field(comic_field, field(prefs[pref_key]))

예제 #8

0

파일 보기

파일: utils.py 프로젝트: qykth-git/calibre-debian

    def markup_user_break(self, replacement_break):
        '''
        Wrap a user supplied scene break string in markup and return it.

        A string without any '<' or '>' is treated as plain text: its
        whitespace becomes &nbsp; and it is wrapped with self.scene_break_open
        and a closing </p>.  A string starting with <img is wrapped the same
        way but left untouched.  A string starting with <hr is placed in a
        "scenebreak" div; when it names a numeric width, the width is removed
        from the tag and turned into matching left/right margins on the div,
        and when the width cannot be parsed a default <hr> is used and a
        warning is logged.  Any other markup is converted to text with
        html2text and then handled like plain text.
        '''
        default_rule = '<hr style="height: 3px; background:#505050" /></div>'
        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'

        # Plain text: nothing that looks like a tag at all.
        if not re.findall('(<|>)', replacement_break):
            padded = re.sub('\\s', '&nbsp;', replacement_break)
            return self.scene_break_open + padded + '</p>'

        # Images are passed through unchanged inside the paragraph wrapper.
        if re.match('^<img', replacement_break):
            return self.scene_break_open + replacement_break + '</p>'

        # Anything that is neither <hr> nor <img> is reduced to text first.
        if not re.match('^<hr', replacement_break):
            from calibre.utils.html2text import html2text
            as_text = html2text(replacement_break)
            padded = re.sub('\\s', '&nbsp;', as_text)
            return self.scene_break_open + padded + '</p>'

        # From here on the string starts with <hr.
        if replacement_break.find('width') == -1:
            return hr_open + default_rule

        try:
            width = int(
                re.sub('.*?width(:|=)(?P<wnum>\\d+).*',
                       '\\g<wnum>', replacement_break))
        except:
            scene_break = hr_open + default_rule
            self.log.warn('Invalid replacement scene break'
                          ' expression, using default')
            return scene_break

        # Move the width from the <hr> itself onto the margins of the div.
        without_width = re.sub(
            '(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)',
            '', replacement_break)
        divpercent = (100 - width) // 2
        hr_open = re.sub('45', unicode_type(divpercent), hr_open)
        return hr_open + without_width + '</div>'

예제 #9

0

파일 보기

파일: utils.py 프로젝트: JimmXinu/calibre

    def markup_user_break(self, replacement_break):
        '''
        Takes string a user supplies and wraps it in markup that will be centered with
        appropriate margins.  <hr> and <img> tags are allowed.  If the user specifies
        a style with width attributes in the <hr> tag then the appropriate margins are
        applied to wrapping divs.  This is because many ebook devices don't support margin:auto
        All other html is converted to text.

        Returns the complete scene break markup as a string.  When an <hr> width
        cannot be parsed as an integer, a default <hr> is used and a warning is logged.
        '''
        default_rule = '<hr style="height: 3px; background:#505050" /></div>'
        hr_open = '<div id="scenebreak" style="margin-left: 45%; margin-right: 45%; margin-top:1.5em; margin-bottom:1.5em; page-break-before:avoid">'
        if re.findall('(<|>)', replacement_break):
            if re.match('^<hr', replacement_break):
                if replacement_break.find('width') != -1:
                    try:
                        # If the pattern does not match, re.sub returns the input
                        # unchanged and int() raises ValueError.
                        width = int(re.sub('.*?width(:|=)(?P<wnum>\\d+).*', '\\g<wnum>', replacement_break))
                    except ValueError:
                        scene_break = hr_open+default_rule
                        self.log.warn('Invalid replacement scene break'
                                ' expression, using default')
                    else:
                        # The width moves from the <hr> onto the margins of the div
                        replacement_break = re.sub('(?i)(width=\\d+\\%?|width:\\s*\\d+(\\%|px|pt|em)?;?)', '', replacement_break)
                        # Floor division keeps the margin an integer; true division
                        # would produce values such as "25.0" or "27.5" on Python 3.
                        divpercent = (100 - width) // 2
                        hr_open = re.sub('45', unicode_type(divpercent), hr_open)
                        scene_break = hr_open+replacement_break+'</div>'
                else:
                    scene_break = hr_open+default_rule
            elif re.match('^<img', replacement_break):
                scene_break = self.scene_break_open+replacement_break+'</p>'
            else:
                from calibre.utils.html2text import html2text
                replacement_break = html2text(replacement_break)
                replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
                scene_break = self.scene_break_open+replacement_break+'</p>'
        else:
            # No markup at all: keep the text, but make its whitespace non-breaking
            replacement_break = re.sub('\\s', '&nbsp;', replacement_break)
            scene_break = self.scene_break_open+replacement_break+'</p>'

        return scene_break

예제 #10

0

파일 보기

파일: csv_xml.py 프로젝트: MarioJC/calibre

    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        '''
        Write a catalog of the library to path_to_output as CSV or XML.

        The output format is taken from the extension of path_to_output and
        stored in self.fmt; only 'csv' and 'xml' produce any output.

        path_to_output: destination file path.
        opts: options object.  This method reads verbose, ids, search_text,
              fields, connected_device and, if present, library_path.
              search_text is reset to None when ids is set.
        db: database object used for searching, field metadata and custom
            column lookups.
        notification: stored on self.notification.
        '''
        from calibre.library import current_library_name
        from calibre.utils.date import isoformat
        from calibre.utils.html2text import html2text
        from calibre.utils.logging import default_log as log
        from lxml import etree
        from calibre.ebooks.metadata import authors_to_string

        self.fmt = path_to_output.rpartition('.')[2]
        self.notification = notification
        current_library = current_library_name()
        if getattr(opts, 'library_path', None):
            current_library = os.path.basename(opts.library_path)

        if opts.verbose:
            opts_dict = vars(opts)
            log("%s('%s'): Generating %s" % (self.name, current_library, self.fmt.upper()))
            if opts.connected_device['is_device_connected']:
                log(" connected_device: %s" % opts.connected_device['name'])
            if opts_dict['search_text']:
                log(" --search='%s'" % opts_dict['search_text'])

            if opts_dict['ids']:
                log(" Book count: %d" % len(opts_dict['ids']))
                if opts_dict['search_text']:
                    log(" (--search ignored when a subset of the database is specified)")

            if opts_dict['fields']:
                if opts_dict['fields'] == 'all':
                    log(" Fields: %s" % ', '.join(FIELDS[1:]))
                else:
                    log(" Fields: %s" % opts_dict['fields'])

        # If a list of ids are provided, don't use search_text
        if opts.ids:
            opts.search_text = None

        data = self.search_sort_db(db, opts)

        if not len(data):
            log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text)
            # raise SystemExit(1)

        # Get the requested output fields as a list
        fields = self.get_output_fields(db, opts)

        # If connected device, add 'On Device' values to data
        if opts.connected_device['is_device_connected'] and 'ondevice' in fields:
            for entry in data:
                entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[entry['id']]['ondevice']

        fm = {x: db.field_metadata.get(x, {}) for x in fields}

        if self.fmt == 'csv':
            # The context manager closes the file even if a row fails to serialize.
            with codecs.open(path_to_output, 'w', 'utf8') as outfile:
                # Write a UTF-8 BOM.  The writer encodes text, so the BOM is
                # written as the code point U+FEFF rather than as raw bytes.
                outfile.write(u'\ufeff')

                # Output the field headers
                outfile.write(u'%s\n' % u','.join(fields))

                # Output the entry fields
                for entry in data:
                    outstr = []
                    for field in fields:
                        if field.startswith('#'):
                            item = db.get_field(entry['id'], field, index_is_id=True)
                            if isinstance(item, (list, tuple)):
                                if fm.get(field, {}).get('display', {}).get('is_names', False):
                                    item = ' & '.join(item)
                                else:
                                    item = ', '.join(item)
                        elif field == 'library_name':
                            item = current_library
                        elif field == 'title_sort':
                            item = entry['sort']
                        else:
                            item = entry[field]

                        if item is None:
                            outstr.append('""')
                            continue
                        elif field == 'formats':
                            # Keep only the lower-cased extension of each format path
                            item = ', '.join(
                                fmt_path.rpartition('.')[2].lower() for fmt_path in item)
                        elif field == 'authors':
                            item = authors_to_string(item)
                        elif field == 'tags':
                            item = ', '.join(item)
                        elif field == 'isbn':
                            # Could be 9, 10 or 13 digits, with hyphens, possibly ending in 'X'
                            item = u'%s' % re.sub(r'[^\dX-]', '', item)
                        elif fm.get(field, {}).get('datatype') == 'datetime':
                            item = isoformat(item, as_utc=False)
                        elif field == 'comments':
                            item = item.replace(u'\r\n', u' ')
                            item = item.replace(u'\n', u' ')
                        elif fm.get(field, {}).get('datatype', None) == 'rating' and item:
                            item = u'%.2g' % (item / 2.0)

                        # Convert HTML to markdown text.  A value counts as HTML when
                        # it contains an opening tag and ends with the matching
                        # closing tag.
                        if isinstance(item, unicode):
                            opening_tag = re.search(r'<(\w+)(\x20|>)', item)
                            if opening_tag:
                                closing_tag = re.search(r'<\/%s>$' % opening_tag.group(1), item)
                                if closing_tag:
                                    item = html2text(item)

                        # Quote every value and double any embedded quotes
                        outstr.append(u'"%s"' % unicode(item).replace('"', '""'))

                    outfile.write(u','.join(outstr) + u'\n')

        elif self.fmt == 'xml':
            from lxml.builder import E

            root = E.calibredb()
            for r in data:
                record = E.record()
                root.append(record)

                # Custom columns: '#' is replaced by '_' to build the element name
                for field in fields:
                    if field.startswith('#'):
                        val = db.get_field(r['id'], field, index_is_id=True)
                        if not isinstance(val, (str, unicode)):
                            val = unicode(val)
                        item = getattr(E, field.replace('#', '_'))(val)
                        record.append(item)

                for field in ('id', 'uuid', 'publisher', 'rating', 'size',
                              'isbn', 'ondevice', 'identifiers'):
                    if field in fields:
                        val = r[field]
                        if not val:
                            continue
                        if not isinstance(val, (str, unicode)):
                            if (fm.get(field, {}).get('datatype', None) ==
                                    'rating' and val):
                                val = u'%.2g' % (val / 2.0)
                            val = unicode(val)
                        item = getattr(E, field)(val)
                        record.append(item)

                if 'title' in fields:
                    title = E.title(r['title'], sort=r['sort'])
                    record.append(title)

                if 'authors' in fields:
                    aus = E.authors(sort=r['author_sort'])
                    for au in r['authors']:
                        aus.append(E.author(au))
                    record.append(aus)

                for field in ('timestamp', 'pubdate'):
                    if field in fields:
                        record.append(getattr(E, field)(isoformat(r[field], as_utc=False)))

                if 'tags' in fields and r['tags']:
                    tags = E.tags()
                    for tag in r['tags']:
                        tags.append(E.tag(tag))
                    record.append(tags)

                if 'comments' in fields and r['comments']:
                    record.append(E.comments(r['comments']))

                if 'series' in fields and r['series']:
                    record.append(E.series(r['series'],
                        index=str(r['series_index'])))

                if 'cover' in fields and r['cover']:
                    record.append(E.cover(r['cover'].replace(os.sep, '/')))

                if 'formats' in fields and r['formats']:
                    fmt = E.formats()
                    for f in r['formats']:
                        fmt.append(E.format(f.replace(os.sep, '/')))
                    record.append(fmt)

                if 'library_name' in fields:
                    record.append(E.library_name(current_library))

            # etree.tostring with an explicit encoding returns bytes, so the
            # file is opened in binary mode.
            with open(path_to_output, 'wb') as f:
                f.write(etree.tostring(root, encoding='utf-8',
                    xml_declaration=True, pretty_print=True))

예제 #11

0

파일 보기

파일: email.py 프로젝트: davidfor/calibre

    def send_by_mail(self, to, fmts, delete_from_library, subject='', send_ids=None,
            do_auto_convert=True, specific_format=None):
        # Email the selected books (or the books in send_ids) to `to`.
        #
        # to: recipient passed on to send_mails, once per attachment.
        # fmts: candidate formats, passed to get_preferred_formats_from_ids
        #       and intersected with available_output_formats() below.
        # delete_from_library: when true, the ids of the books that were sent
        #       are handed to self.email_sent as `remove`.
        # subject: when empty, a default 'E-book: <title>' subject is used;
        #       otherwise it is expanded per book with get_components.
        # send_ids: explicit book ids; when None the ids come from the rows
        #       selected in self.library_view.
        # do_auto_convert: when true, books reported in _auto_ids are offered
        #       for conversion instead of being sent directly.
        # specific_format: passed to get_preferred_formats_from_ids and used
        #       as the preferred conversion target.
        #
        # Books for which nothing could be sent or converted are collected in
        # `bad` and reported in a warning dialog at the end.
        ids = [self.library_view.model().id(r) for r in self.library_view.selectionModel().selectedRows()] if send_ids is None else send_ids
        if not ids or len(ids) == 0:
            return

        files, _auto_ids = self.library_view.model().get_preferred_formats_from_ids(ids,
                                    fmts, set_metadata=True,
                                    specific_format=specific_format,
                                    exclude_auto=do_auto_convert,
                                    use_plugboard=plugboard_email_value,
                                    plugboard_formats=plugboard_email_formats)
        if do_auto_convert:
            # Drop the ids in _auto_ids from ids while keeping the original order
            nids = list(set(ids).difference(_auto_ids))
            ids = [i for i in ids if i in nids]
        else:
            _auto_ids = []

        full_metadata = self.library_view.model().metadata_for(ids,
                get_cover=False)

        bad, remove_ids, jobnames = [], [], []
        texts, subjects, attachments, attachment_names = [], [], [], []
        # Build one subject, body text and attachment per book that has a file
        for f, mi, id in zip(files, full_metadata, ids):
            t = mi.title
            if not t:
                t = _('Unknown')
            if f is None:
                # No file for this book: report it at the end
                bad.append(t)
            else:
                remove_ids.append(id)
                jobnames.append(t)
                attachments.append(f)
                if not subject:
                    subjects.append(_('E-book:')+ ' '+t)
                else:
                    # Expand the subject for this book; fall back to the title
                    # when the expansion yields nothing
                    components = get_components(subject, mi, id)
                    if not components:
                        components = [mi.title]
                    subjects.append(os.path.join(*components))
                a = authors_to_string(mi.authors if mi.authors else
                        [_('Unknown')])
                # The format named in the body is the upper-cased file extension
                texts.append(_('Attached, you will find the e-book') +
                        '\n\n' + t + '\n\t' + _('by') + ' ' + a + '\n\n' +
                        _('in the %s format.') %
                        os.path.splitext(f)[1][1:].upper())
                if mi.comments and gprefs['add_comments_to_email']:
                    from calibre.utils.html2text import html2text
                    # Append the book comments as wrapped plain text
                    texts[-1] += '\n\n' + _('About this book:') + '\n\n' + textwrap.fill(html2text(mi.comments))
                # Attachment name: '<title> - <authors>' plus the file extension
                prefix = ascii_filename(t+' - '+a)
                if not isinstance(prefix, unicode):
                    prefix = prefix.decode(preferred_encoding, 'replace')
                attachment_names.append(prefix + os.path.splitext(f)[1])
        remove = remove_ids if delete_from_library else []

        # send_mails receives one recipient entry per attachment
        to_s = list(repeat(to, len(attachments)))
        if attachments:
            send_mails(jobnames,
                    Dispatcher(partial(self.email_sent, remove=remove)),
                    attachments, to_s, subjects, texts, attachment_names,
                    self.job_manager)
            self.status_bar.show_message(_('Sending email to')+' '+to, 3000)

        # Decide which of the _auto_ids books can be converted before sending
        auto = []
        if _auto_ids != []:
            for id in _auto_ids:
                if specific_format is None:
                    dbfmts = self.library_view.model().db.formats(id, index_is_id=True)
                    formats = [f.lower() for f in (dbfmts.split(',') if dbfmts else
                        [])]
                    # Convertible when the book has a usable input format and at
                    # least one requested format is a possible output format
                    if list(set(formats).intersection(available_input_formats())) != [] and list(set(fmts).intersection(available_output_formats())) != []:
                        auto.append(id)
                    else:
                        bad.append(self.library_view.model().db.title(id, index_is_id=True))
                else:
                    if specific_format in list(set(fmts).intersection(set(available_output_formats()))):
                        auto.append(id)
                    else:
                        bad.append(self.library_view.model().db.title(id, index_is_id=True))

        if auto != []:
            # Conversion target: specific_format when usable, otherwise the first
            # entry of fmts that is an available output format
            format = specific_format if specific_format in list(set(fmts).intersection(set(available_output_formats()))) else None
            if not format:
                for fmt in fmts:
                    if fmt in list(set(fmts).intersection(set(available_output_formats()))):
                        format = fmt
                        break
            if format is None:
                # Here `bad` receives book ids, whereas the entries added above are titles
                bad += auto
            else:
                autos = [self.library_view.model().db.title(id, index_is_id=True) for id in auto]
                if self.auto_convert_question(
                    _('Auto convert the following books to %s before sending via '
                        'email?') % format.upper(), autos):
                    self.iactions['Convert Books'].auto_convert_mail(to, fmts, delete_from_library, auto, format, subject)

        if bad:
            # One entry per line in the details of the warning dialog
            bad = '\n'.join('%s'%(i,) for i in bad)
            d = warning_dialog(self, _('No suitable formats'),
                _('Could not email the following books '
                'as no suitable formats were found:'), bad)
            d.exec_()

예제 #12

0

파일 보기

파일: comments.py 프로젝트: siebert/calibre

def sanitize_comments_html(html):
    """Return *html* re-rendered through Markdown with all newlines removed.

    ``html2text`` turns the input into text, a ``Markdown`` converter
    created with ``safe_mode='remove'`` renders it again, and every run of
    newline characters is dropped from the result.
    """
    plain = html2text(html)
    converter = Markdown(safe_mode='remove')
    rendered = converter.convert(plain)
    # Join the rendered markup into a single line.
    return re.sub('\n+', '', rendered)

예제 #13

0

파일 보기

파일: comments.py 프로젝트: artbycrunk/calibre

def sanitize_comments_html(html):
    """Convert *html* to text with html2text and render it with Markdown.

    The rendered markup from a default ``Markdown`` converter is returned
    as is.
    """
    from calibre.ebooks.markdown import Markdown
    markdown_source = html2text(html)
    renderer = Markdown()
    return renderer.convert(markdown_source)

예제 #14

0

파일 보기

def sanitize_comments_html(html):
    """Return *html* re-rendered through Markdown in safe mode, on one line.

    The input is converted to text by ``html2text`` and rendered by
    ``markdown.Markdown(safe_mode=True)``.  Every run of newline characters
    and every occurrence of ``markdown.HTML_REMOVED_TEXT`` is then removed
    from the rendered markup.
    """
    plain = html2text(html)
    converter = markdown.Markdown(safe_mode=True)
    single_line = re.sub('\n+', '', converter.convert(plain))
    # Strip the placeholder text named by markdown.HTML_REMOVED_TEXT.
    return single_line.replace(markdown.HTML_REMOVED_TEXT, '')

예제 #15

0

파일 보기

def sanitize_comments_html(html):
    """Sanitize comment markup and return it as a single line.

    The input goes through ``html2text``, is rendered by a ``Markdown``
    converter created with ``safe_mode='remove'``, and has every run of
    newline characters removed.
    """
    from calibre.ebooks.markdown import Markdown
    text_form = html2text(html)
    renderer = Markdown(safe_mode='remove')
    markup = renderer.convert(text_form)
    newline_runs = '\n+'
    return re.sub(newline_runs, '', markup)

예제 #16

0

파일 보기

def _set_comments(title_info, mi, ctx):
    """Replace the 'annotation' element of *title_info* with the comments.

    Nothing happens when ``mi.is_null('comments')`` is true.  Otherwise the
    existing 'annotation' meta tags are cleared, the element is fetched or
    created through *ctx*, and the comments, converted with ``html2text``,
    are written into it with ``ctx.text2fb2``.
    """
    if mi.is_null('comments'):
        return
    from calibre.utils.html2text import html2text
    tag_name = 'annotation'
    ctx.clear_meta_tags(title_info, tag_name)
    annotation = ctx.get_or_create(title_info, tag_name)
    ctx.text2fb2(annotation, html2text(mi.comments))

예제 #17

0

파일 보기

파일: comments.py 프로젝트: BobPyron/calibre

def sanitize_comments_html(html):
    """Return a sanitized, single-line HTML rendering of *html*.

    Steps: flatten the markup with ``html2text``; render the text with
    ``markdown.Markdown(safe_mode=True)``; delete every run of newlines; and
    finally remove all occurrences of ``markdown.HTML_REMOVED_TEXT``.
    """
    converter_input = html2text(html)
    converter = markdown.Markdown(safe_mode=True)
    single_line = re.sub('\n+', '', converter.convert(converter_input))
    return single_line.replace(markdown.HTML_REMOVED_TEXT, '')

예제 #18

0

파일 보기

파일: bibtex.py 프로젝트: AEliu/calibre

        def create_bibtex_entry(entry, fields, mode, template_citation,
                                    bibtexdict, db, citation_bibtex=True, calibre_files=True):
            """Format one catalog entry as a BibTeX record.

            Args:
                entry: Mapping of field name to value for the book; read via
                    ``entry['id']``, ``entry['sort']`` and ``entry[field]``.
                fields: Iterable of field names to emit, in output order.
                    Names starting with ``#`` are fetched with ``db.get_field``.
                mode: ``"misc"`` always produces ``@misc``; ``"book"`` produces
                    ``@book`` only if ``check_entry_book_valid(entry)`` is true
                    and otherwise yields ``''``; any other value produces
                    ``@book`` for valid entries and ``@misc`` for the rest.
                template_citation: Passed through to ``make_bibtex_citation``.
                bibtexdict: Helper providing ``utf8ToBibtex``,
                    ``bibtex_author_format`` and ``stripUnmatchedSyntax``.
                db: Database object used to look up ``#`` fields.
                citation_bibtex: When true, the citation key is appended to
                    the record header.
                calibre_files: When true, a ``file`` line is emitted together
                    with the ``formats`` line.

            Returns:
                The record as a unicode string terminated by a closing brace
                and a blank line, or ``''`` when ``mode == "book"`` and the
                entry is not a valid book.

            Note:
                ``library_name``, ``log`` and ``html2text`` are resolved from
                the enclosing scope.
            """
            # BibTeX doesn't like UTF-8, but keep unicode until writing.
            bibtex_entry = []
            if mode != "misc" and check_entry_book_valid(entry):
                bibtex_entry.append(u'@book{')
            elif mode != "book":
                bibtex_entry.append(u'@misc{')
            else:
                # Strict "book" mode and the entry is not a valid book.
                return ''

            if citation_bibtex:
                # Citation tag: fused with the record type into one header item.
                bibtex_entry.append(make_bibtex_citation(entry, template_citation,
                    bibtexdict))
                bibtex_entry = [u' '.join(bibtex_entry)]

            for field in fields:
                if field.startswith('#'):
                    item = db.get_field(entry['id'], field, index_is_id=True)
                    if isinstance(item, (bool, float, int)):
                        item = repr(item)
                elif field == 'title_sort':
                    item = entry['sort']
                elif field == 'library_name':
                    item = library_name
                else:
                    item = entry[field]

                # Skip fields that are unset (None) or empty.
                if item is None:
                    continue
                try:
                    if len(item) == 0:
                        continue
                except TypeError:
                    # Values without a length (numbers, dates) are never "empty".
                    pass

                if field == 'authors':
                    bibtex_entry.append(u'author = "%s"' % bibtexdict.bibtex_author_format(item))

                elif field == 'id':
                    bibtex_entry.append(u'calibreid = "%s"' % int(item))

                elif field == 'rating':
                    bibtex_entry.append(u'rating = "%s"' % int(item))

                elif field == 'size':
                    bibtex_entry.append(u'%s = "%s octets"' % (field, int(item)))

                elif field == 'tags':
                    # A list to flatten.
                    bibtex_entry.append(u'tags = "%s"' % bibtexdict.utf8ToBibtex(u', '.join(item)))

                elif field == 'comments':
                    # Newline removal.
                    item = item.replace(u'\r\n', u' ')
                    item = item.replace(u'\n', u' ')
                    # Unmatched brace removal (users should use \leftbrace or
                    # \rightbrace for single braces).
                    item = bibtexdict.stripUnmatchedSyntax(item, u'{', u'}')
                    # HTML to text; on failure the unconverted text is kept.
                    try:
                        item = html2text(item)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt/SystemExit must
                        # still propagate.
                        log.warn("Failed to convert comments to text")
                    bibtex_entry.append(u'note = "%s"' % bibtexdict.utf8ToBibtex(item))

                elif field == 'isbn':
                    # Could be 9, 10 or 13 digits.
                    bibtex_entry.append(u'isbn = "%s"' % format_isbn(item))

                elif field == 'formats':
                    # Each item is a path; its extension is the format name.
                    formats = [fmt.rpartition('.')[2].lower() for fmt in item]
                    bibtex_entry.append(u'formats = "%s"' % u', '.join(formats))
                    if calibre_files:
                        # Add the file path for every selected format.
                        files = [u':%s:%s' % (fmt, fmt.rpartition('.')[2].upper())
                                 for fmt in item]
                        bibtex_entry.append(u'file = "%s"' % u', '.join(files))

                elif field == 'series_index':
                    bibtex_entry.append(u'volume = "%s"' % int(item))

                elif field == 'timestamp':
                    # Keep only the date part of the ISO timestamp.
                    bibtex_entry.append(u'timestamp = "%s"' % isoformat(item).partition('T')[0])

                elif field == 'pubdate':
                    bibtex_entry.append(u'year = "%s"' % item.year)
                    bibtex_entry.append(u'month = "%s"' % bibtexdict.utf8ToBibtex(strftime("%b", item)))

                elif field.startswith('#') and isinstance(item, basestring):
                    bibtex_entry.append(u'custom_%s = "%s"' % (field[1:],
                        bibtexdict.utf8ToBibtex(item)))

                elif isinstance(item, basestring):
                    # Any remaining textual field (title, publisher, cover,
                    # uuid, ondevice, author_sort, series, title_sort, ...).
                    bibtex_entry.append(u'%s = "%s"' % (field, bibtexdict.utf8ToBibtex(item)))

            bibtex_entry = u',\n    '.join(bibtex_entry)
            bibtex_entry += u' }\n\n'

            return bibtex_entry

예제 #19

0

파일 보기

    def fb2_header(self):
        """Return the FB2 document opening: root tag plus ``<description>``.

        Title, language, authors, subjects, series, identifiers, date,
        publisher and description are read from ``self.oeb_book.metadata``;
        the cover markup comes from ``self.get_cover()`` and the genre from
        ``self.opts.fb2_genre``. Plain-text values are escaped with
        ``prepare_string_for_xml`` before substitution into the template.

        Returns:
            The header markup with empty lines removed. The ``<FictionBook>``
            element is left open by this method.
        """
        from calibre.ebooks.oeb.base import OPF

        def scheme_of(identifier):
            # The scheme attribute may be missing; treat that as "no scheme"
            # instead of failing on None.lower().
            return (identifier.get(OPF('scheme'), None) or '').lower()

        oeb_meta = self.oeb_book.metadata
        # Read the clock once so day/month/year cannot straddle midnight.
        now = datetime.now()

        metadata = {}
        metadata['title'] = oeb_meta.title[0].value
        metadata['appname'] = __appname__
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
        if oeb_meta.language:
            raw_lang = oeb_meta.language[0].value
            # Prefer the ISO 639-1 code, then the raw value, then English.
            metadata['lang'] = lang_as_iso639_1(raw_lang) or raw_lang or 'en'
        else:
            metadata['lang'] = u'en'
        metadata['id'] = None
        metadata['cover'] = self.get_cover()
        metadata['genre'] = self.opts.fb2_genre

        # One <author> element per creator; names are split on single spaces
        # into first / middle / last.
        author_xml = []
        for auth in oeb_meta.creator:
            author_first = author_middle = author_last = ''
            author_parts = auth.value.split(' ')
            if len(author_parts) == 1:
                author_last = author_parts[0]
            elif len(author_parts) == 2:
                author_first, author_last = author_parts
            else:
                author_first = author_parts[0]
                author_middle = ' '.join(author_parts[1:-1])
                author_last = author_parts[-1]
            author_xml.append('<author>')
            author_xml.append(
                '<first-name>%s</first-name>' % prepare_string_for_xml(author_first))
            if author_middle:
                author_xml.append(
                    '<middle-name>%s</middle-name>' % prepare_string_for_xml(author_middle))
            author_xml.append(
                '<last-name>%s</last-name>' % prepare_string_for_xml(author_last))
            author_xml.append('</author>')
        # Fall back to an empty author element when there are no creators.
        metadata['author'] = ''.join(author_xml) or (
            '<author><first-name></first-name><last-name></last-name></author>')

        metadata['keywords'] = ''
        tags = list(map(unicode_type, oeb_meta.subject))
        if tags:
            tags = ', '.join(prepare_string_for_xml(x) for x in tags)
            metadata['keywords'] = '<keywords>%s</keywords>' % tags

        metadata['sequence'] = ''
        if oeb_meta.series:
            index = '1'
            if oeb_meta.series_index:
                index = oeb_meta.series_index[0]
            metadata['sequence'] = '<sequence name="%s" number="%s"/>' % (
                prepare_string_for_xml('%s' % oeb_meta.series[0]), index)

        year = publisher = isbn = ''
        identifiers = oeb_meta['identifier']
        # The first UUID-like identifier becomes the document id.
        for x in identifiers:
            if scheme_of(x) == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
                metadata['id'] = unicode_type(x).split(':')[-1]
                break
        if metadata['id'] is None:
            self.log.warn('No UUID identifier found')
            metadata['id'] = unicode_type(uuid.uuid4())

        try:
            date = oeb_meta['date'][0]
        except IndexError:
            pass
        else:
            # Only the part before the first '-' is used as the year.
            year = '<year>%s</year>' % prepare_string_for_xml(
                date.value.partition('-')[0])

        try:
            publisher = oeb_meta['publisher'][0]
        except IndexError:
            pass
        else:
            publisher = '<publisher>%s</publisher>' % prepare_string_for_xml(
                publisher.value)

        # No break here: when several ISBN identifiers exist the last one wins.
        for x in identifiers:
            if scheme_of(x) == 'isbn':
                isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

        metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher
        # Escape the plain-text values; the excluded keys already hold markup.
        for key, value in list(metadata.items()):
            if key not in ('author', 'cover', 'sequence', 'keywords', 'year',
                           'publisher', 'isbn'):
                metadata[key] = prepare_string_for_xml(value)

        try:
            comments = oeb_meta['description'][0]
        except Exception:
            metadata['comments'] = ''
        else:
            from calibre.utils.html2text import html2text
            metadata['comments'] = '<annotation><p>{}</p></annotation>'.format(
                prepare_string_for_xml(html2text(comments.value).strip()))

        # Keep the indentation level of the description the same as the body.
        header = textwrap.dedent('''\
            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:l="http://www.w3.org/1999/xlink">
            <description>
                <title-info>
                    <genre>%(genre)s</genre>
                    %(author)s
                    <book-title>%(title)s</book-title>
                    %(cover)s
                    <lang>%(lang)s</lang>
                    %(keywords)s
                    %(sequence)s
                    %(comments)s
                </title-info>
                <document-info>
                    %(author)s
                    <program-used>%(appname)s %(version)s</program-used>
                    <date>%(date)s</date>
                    <id>%(id)s</id>
                    <version>1.0</version>
                </document-info>
                <publish-info>
                    %(publisher)s
                    %(year)s
                    %(isbn)s
                </publish-info>
            </description>''') % metadata

        # Remove empty lines.
        return '\n'.join(filter(unicode_type.strip, header.splitlines()))

예제 #20

0

파일 보기

파일: comments.py 프로젝트: 089git/calibre

def sanitize_comments_html(html):
    """Return a newline-free, Markdown-rendered version of *html*.

    The input is reduced to text by ``html2text``, rendered by
    ``Markdown(safe_mode='remove')``, and then stripped of every run of
    newline characters.
    """
    intermediate_text = html2text(html)
    renderer = Markdown(safe_mode='remove')
    rendered_html = renderer.convert(intermediate_text)
    return re.sub('\n+', '', rendered_html)

예제 #21

0

파일 보기

        def create_bibtex_entry(entry,
                                fields,
                                mode,
                                template_citation,
                                bibtexdict,
                                db,
                                citation_bibtex=True,
                                calibre_files=True):
            """Format one catalog entry as a BibTeX record.

            Args:
                entry: Mapping of field name to value for the book; read via
                    ``entry['id']``, ``entry['sort']`` and ``entry[field]``.
                fields: Iterable of field names to emit, in output order.
                    Names starting with ``#`` are fetched with ``db.get_field``.
                mode: ``"misc"`` always produces ``@misc``; ``"book"`` produces
                    ``@book`` only if ``check_entry_book_valid(entry)`` is true
                    and otherwise yields ``''``; any other value produces
                    ``@book`` for valid entries and ``@misc`` for the rest.
                template_citation: Passed through to ``make_bibtex_citation``.
                bibtexdict: Helper providing ``utf8ToBibtex``,
                    ``bibtex_author_format`` and ``stripUnmatchedSyntax``.
                db: Database object used to look up ``#`` fields.
                citation_bibtex: When true, the citation key is appended to
                    the record header.
                calibre_files: When true, a ``file`` line is emitted together
                    with the ``formats`` line.

            Returns:
                The record as a unicode string terminated by a closing brace
                and a blank line, or ``''`` when ``mode == "book"`` and the
                entry is not a valid book.

            Note:
                ``library_name``, ``log`` and ``html2text`` are resolved from
                the enclosing scope.
            """
            # BibTeX doesn't like UTF-8, but keep unicode until writing.
            bibtex_entry = []
            if mode != "misc" and check_entry_book_valid(entry):
                bibtex_entry.append(u'@book{')
            elif mode != "book":
                bibtex_entry.append(u'@misc{')
            else:
                # Strict "book" mode and the entry is not a valid book.
                return ''

            if citation_bibtex:
                # Citation tag: fused with the record type into one header item.
                bibtex_entry.append(
                    make_bibtex_citation(entry, template_citation, bibtexdict))
                bibtex_entry = [u' '.join(bibtex_entry)]

            for field in fields:
                if field.startswith('#'):
                    item = db.get_field(entry['id'], field, index_is_id=True)
                    if isinstance(item, (bool, float, int)):
                        item = repr(item)
                elif field == 'title_sort':
                    item = entry['sort']
                elif field == 'library_name':
                    item = library_name
                else:
                    item = entry[field]

                # Skip fields that are unset (None) or empty.
                if item is None:
                    continue
                try:
                    if len(item) == 0:
                        continue
                except TypeError:
                    # Values without a length (numbers, dates) are never "empty".
                    pass

                if field == 'authors':
                    bibtex_entry.append(u'author = "%s"' %
                                        bibtexdict.bibtex_author_format(item))

                elif field == 'id':
                    bibtex_entry.append(u'calibreid = "%s"' % int(item))

                elif field == 'rating':
                    bibtex_entry.append(u'rating = "%s"' % int(item))

                elif field == 'size':
                    bibtex_entry.append(u'%s = "%s octets"' %
                                        (field, int(item)))

                elif field == 'tags':
                    # A list to flatten.
                    bibtex_entry.append(
                        u'tags = "%s"' %
                        bibtexdict.utf8ToBibtex(u', '.join(item)))

                elif field == 'comments':
                    # Newline removal.
                    item = item.replace(u'\r\n', u' ')
                    item = item.replace(u'\n', u' ')
                    # Unmatched brace removal (users should use \leftbrace or
                    # \rightbrace for single braces).
                    item = bibtexdict.stripUnmatchedSyntax(item, u'{', u'}')
                    # HTML to text; on failure the unconverted text is kept.
                    try:
                        item = html2text(item)
                    except Exception:
                        # Not a bare except: KeyboardInterrupt/SystemExit must
                        # still propagate.
                        log.warn("Failed to convert comments to text")
                    bibtex_entry.append(u'note = "%s"' %
                                        bibtexdict.utf8ToBibtex(item))

                elif field == 'isbn':
                    # Could be 9, 10 or 13 digits.
                    bibtex_entry.append(u'isbn = "%s"' % format_isbn(item))

                elif field == 'formats':
                    # Each item is a path; its extension is the format name.
                    formats = [
                        fmt.rpartition('.')[2].lower() for fmt in item
                    ]
                    bibtex_entry.append(u'formats = "%s"' %
                                        u', '.join(formats))
                    if calibre_files:
                        # Add the file path for every selected format.
                        files = [
                            u':%s:%s' % (fmt, fmt.rpartition('.')[2].upper())
                            for fmt in item
                        ]
                        bibtex_entry.append(u'file = "%s"' % u', '.join(files))

                elif field == 'series_index':
                    bibtex_entry.append(u'volume = "%s"' % int(item))

                elif field == 'timestamp':
                    # Keep only the date part of the ISO timestamp.
                    bibtex_entry.append(u'timestamp = "%s"' %
                                        isoformat(item).partition('T')[0])

                elif field == 'pubdate':
                    bibtex_entry.append(u'year = "%s"' % item.year)
                    bibtex_entry.append(
                        u'month = "%s"' %
                        bibtexdict.utf8ToBibtex(strftime("%b", item)))

                elif field.startswith('#') and isinstance(item, basestring):
                    bibtex_entry.append(
                        u'custom_%s = "%s"' %
                        (field[1:], bibtexdict.utf8ToBibtex(item)))

                elif isinstance(item, basestring):
                    # Any remaining textual field (title, publisher, cover,
                    # uuid, ondevice, author_sort, series, title_sort, ...).
                    bibtex_entry.append(u'%s = "%s"' %
                                        (field, bibtexdict.utf8ToBibtex(item)))

            bibtex_entry = u',\n    '.join(bibtex_entry)
            bibtex_entry += u' }\n\n'

            return bibtex_entry

예제 #22

0

파일 보기

def sanitize_comments_html(html):
    """Rebuild *html* from its plain-text rendering using Markdown.

    ``html2text`` converts the markup to text, and a ``Markdown`` converter
    with default settings turns that text back into HTML, which is returned.
    """
    from calibre.ebooks.markdown import Markdown

    text_form = html2text(html)
    converter = Markdown()
    return converter.convert(text_form)

예제 #23

0

파일 보기

파일: fb2ml.py 프로젝트: j-howell/calibre

    def fb2_header(self):
        """Return the FB2 document opening: root tag plus ``<description>``.

        Title, language, authors, subjects, series, identifiers, date,
        publisher and description are read from ``self.oeb_book.metadata``;
        the cover markup comes from ``self.get_cover()`` and the genre from
        ``self.opts.fb2_genre``. Plain-text values are escaped with
        ``prepare_string_for_xml`` before substitution into the template.

        Returns:
            The dedented header markup. The ``<FictionBook>`` element is left
            open by this method.
        """
        from calibre.ebooks.oeb.base import OPF

        def scheme_of(identifier):
            # The scheme attribute may be missing; treat that as "no scheme"
            # instead of failing on None.lower().
            return (identifier.get(OPF('scheme'), None) or '').lower()

        oeb_meta = self.oeb_book.metadata
        # Read the clock once so day/month/year cannot straddle midnight.
        now = datetime.now()

        metadata = {}
        metadata['title'] = oeb_meta.title[0].value
        metadata['appname'] = __appname__
        metadata['version'] = __version__
        metadata['date'] = '%i.%i.%i' % (now.day, now.month, now.year)
        if oeb_meta.language:
            raw_lang = oeb_meta.language[0].value
            # Prefer the ISO 639-1 code, then the raw value, then English.
            metadata['lang'] = lang_as_iso639_1(raw_lang) or raw_lang or 'en'
        else:
            metadata['lang'] = u'en'
        metadata['id'] = None
        metadata['cover'] = self.get_cover()
        metadata['genre'] = self.opts.fb2_genre

        # One <author> element per creator; names are split on single spaces
        # into first / middle / last.
        author_xml = []
        for auth in oeb_meta.creator:
            author_first = author_middle = author_last = u''
            author_parts = auth.value.split(' ')
            if len(author_parts) == 1:
                author_last = author_parts[0]
            elif len(author_parts) == 2:
                author_first, author_last = author_parts
            else:
                author_first = author_parts[0]
                author_middle = ' '.join(author_parts[1:-1])
                author_last = author_parts[-1]
            author_xml.append('<author>')
            author_xml.append('<first-name>%s</first-name>' % prepare_string_for_xml(author_first))
            if author_middle:
                author_xml.append('<middle-name>%s</middle-name>' % prepare_string_for_xml(author_middle))
            author_xml.append('<last-name>%s</last-name>' % prepare_string_for_xml(author_last))
            author_xml.append('</author>')
        # Fall back to an empty author element when there are no creators.
        metadata['author'] = u''.join(author_xml) or (
            u'<author><first-name></first-name><last-name></last-name></author>')

        metadata['keywords'] = u''
        tags = list(map(unicode_type, oeb_meta.subject))
        if tags:
            tags = ', '.join(prepare_string_for_xml(x) for x in tags)
            metadata['keywords'] = '<keywords>%s</keywords>'%tags

        metadata['sequence'] = u''
        if oeb_meta.series:
            index = '1'
            if oeb_meta.series_index:
                index = oeb_meta.series_index[0]
            metadata['sequence'] = u'<sequence name="%s" number="%s" />' % (
                prepare_string_for_xml(u'%s' % oeb_meta.series[0]), index)

        year = publisher = isbn = u''
        identifiers = oeb_meta['identifier']
        # The first UUID-like identifier becomes the document id.
        for x in identifiers:
            if scheme_of(x) == 'uuid' or unicode_type(x).startswith('urn:uuid:'):
                metadata['id'] = unicode_type(x).split(':')[-1]
                break
        if metadata['id'] is None:
            self.log.warn('No UUID identifier found')
            metadata['id'] = str(uuid.uuid4())

        try:
            date = oeb_meta['date'][0]
        except IndexError:
            pass
        else:
            # Only the part before the first '-' is used as the year.
            year = '<year>%s</year>' % prepare_string_for_xml(date.value.partition('-')[0])

        try:
            publisher = oeb_meta['publisher'][0]
        except IndexError:
            pass
        else:
            publisher = '<publisher>%s</publisher>' % prepare_string_for_xml(publisher.value)

        # No break here: when several ISBN identifiers exist the last one wins.
        for x in identifiers:
            if scheme_of(x) == 'isbn':
                isbn = '<isbn>%s</isbn>' % prepare_string_for_xml(x.value)

        metadata['year'], metadata['isbn'], metadata['publisher'] = year, isbn, publisher
        # Escape the plain-text values; the excluded keys already hold markup.
        for key, value in list(metadata.items()):
            if key not in ('author', 'cover', 'sequence', 'keywords', 'year', 'publisher', 'isbn'):
                metadata[key] = prepare_string_for_xml(value)

        try:
            comments = oeb_meta['description'][0]
        except Exception:
            metadata['comments'] = ''
        else:
            from calibre.utils.html2text import html2text
            metadata['comments'] = '<annotation>{}</annotation>'.format(prepare_string_for_xml(html2text(comments.value.strip())))

        return textwrap.dedent(u'''
            <FictionBook xmlns="http://www.gribuser.ru/xml/fictionbook/2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
                <description>
                    <title-info>
                        <genre>%(genre)s</genre>
                            %(author)s
                        <book-title>%(title)s</book-title>
                        %(cover)s
                        <lang>%(lang)s</lang>
                        %(keywords)s
                        %(sequence)s
                        %(comments)s
                    </title-info>
                    <document-info>
                        %(author)s
                        <program-used>%(appname)s %(version)s</program-used>
                        <date>%(date)s</date>
                        <id>%(id)s</id>
                        <version>1.0</version>
                    </document-info>
                    <publish-info>
                        %(publisher)s
                        %(year)s
                        %(isbn)s
                    </publish-info>
                </description>\n''') % metadata

예제 #24

0

파일 보기

    def send_by_mail(self,
                     to,
                     fmts,
                     delete_from_library,
                     subject='',
                     send_ids=None,
                     do_auto_convert=True,
                     specific_format=None):
        """Email the given (or currently selected) books to *to*.

        Books with a file in one of *fmts* are queued through ``send_mails``.
        Books returned as auto-convert candidates are, after user
        confirmation, handed to the 'Convert Books' action. Titles for which
        nothing suitable exists are listed in a warning dialog.

        Args:
            to: Recipient address; also shown in the status bar message.
            fmts: Acceptable formats, in order of preference.
            delete_from_library: When true, the ids of the mailed books are
                passed as ``remove`` to ``self.email_sent``.
            subject: Optional subject template; when empty, a default subject
                built from the title is used.
            send_ids: Book ids to send; ``None`` means the rows selected in
                the library view. An empty value makes the call a no-op.
            do_auto_convert: Whether books lacking a suitable format are
                considered for automatic conversion.
            specific_format: Optional single format to request.
        """
        if send_ids is None:
            view = self.library_view
            ids = [
                view.model().id(row)
                for row in view.selectionModel().selectedRows()
            ]
        else:
            ids = send_ids
        if not ids:
            return

        def usable_output_formats():
            # Requested formats that the converter is able to produce.
            return set(fmts).intersection(set(available_output_formats()))

        files, _auto_ids = self.library_view.model(
        ).get_preferred_formats_from_ids(
            ids,
            fmts,
            set_metadata=True,
            specific_format=specific_format,
            exclude_auto=do_auto_convert,
            use_plugboard=plugboard_email_value,
            plugboard_formats=plugboard_email_formats)
        if do_auto_convert:
            # Drop the auto-convert candidates while keeping the id order.
            directly_sendable = set(ids).difference(_auto_ids)
            ids = [i for i in ids if i in directly_sendable]
        else:
            _auto_ids = []

        full_metadata = self.library_view.model().metadata_for(ids,
                                                               get_cover=False)

        bad, remove_ids, jobnames = [], [], []
        texts, subjects, attachments, attachment_names = [], [], [], []
        for path, mi, book_id in zip(files, full_metadata, ids):
            title = mi.title or _('Unknown')
            if path is None:
                # No file in an acceptable format for this book.
                bad.append(title)
                continue

            remove_ids.append(book_id)
            jobnames.append(title)
            attachments.append(path)

            if subject:
                components = get_components(subject, mi, book_id) or [mi.title]
                subjects.append(os.path.join(*components))
            else:
                subjects.append(_('E-book:') + ' ' + title)

            author_str = authors_to_string(
                mi.authors if mi.authors else [_('Unknown')])
            extension = os.path.splitext(path)[1]
            body = (_('Attached, you will find the e-book') + '\n\n' + title +
                    '\n\t' + _('by') + ' ' + author_str + '\n\n' +
                    _('in the %s format.') % extension[1:].upper())
            if mi.comments and gprefs['add_comments_to_email']:
                from calibre.utils.html2text import html2text
                body += '\n\n' + _('About this book:') + '\n\n' + textwrap.fill(
                    html2text(mi.comments))
            texts.append(body)

            prefix = ascii_filename(title + ' - ' + author_str)
            if not isinstance(prefix, unicode_type):
                prefix = prefix.decode(preferred_encoding, 'replace')
            attachment_names.append(prefix + extension)

        remove = remove_ids if delete_from_library else []
        to_s = list(repeat(to, len(attachments)))
        if attachments:
            on_sent = Dispatcher(partial(self.email_sent, remove=remove))
            send_mails(jobnames, on_sent, attachments, to_s, subjects, texts,
                       attachment_names, self.job_manager)
            self.status_bar.show_message(
                _('Sending email to') + ' ' + to, 3000)

        # Sort the auto-convert candidates into convertible and unusable.
        auto = []
        for book_id in _auto_ids:
            if specific_format is None:
                dbfmts = self.library_view.model().db.formats(
                    book_id, index_is_id=True)
                formats = [
                    name.lower()
                    for name in (dbfmts.split(',') if dbfmts else [])
                ]
                convertible = (
                    set(formats).intersection(available_input_formats())
                    and set(fmts).intersection(available_output_formats()))
            else:
                convertible = specific_format in usable_output_formats()
            if convertible:
                auto.append(book_id)
            else:
                bad.append(self.library_view.model().db.title(
                    book_id, index_is_id=True))

        if auto:
            if specific_format in usable_output_formats():
                target = specific_format
            else:
                target = None
            if not target:
                # Fall back to the first requested format that can be produced.
                for candidate in fmts:
                    if candidate in usable_output_formats():
                        target = candidate
                        break
            if target is None:
                bad.extend(auto)
            else:
                autos = [
                    self.library_view.model().db.title(book_id,
                                                       index_is_id=True)
                    for book_id in auto
                ]
                question = _(
                    'Auto convert the following books to %s before sending via '
                    'email?') % target.upper()
                if self.auto_convert_question(question, autos):
                    self.iactions['Convert Books'].auto_convert_mail(
                        to, fmts, delete_from_library, auto, target, subject)

        if bad:
            bad = '\n'.join('%s' % (i, ) for i in bad)
            d = warning_dialog(
                self, _('No suitable formats'),
                _('Could not email the following books '
                  'as no suitable formats were found:'), bad)
            d.exec_()

예제 #25

0

파일 보기

파일: fb2.py 프로젝트: SAB2012/calibre

def _set_comments(title_info, mi, ctx):
    if not mi.is_null('comments'):
        from calibre.utils.html2text import html2text
        ctx.clear_meta_tags(title_info, 'annotation')
        title = ctx.get_or_create(title_info, 'annotation')
        ctx.text2fb2(title, html2text(mi.comments))

예제 #26

0

파일 보기

def identify(log, abort,  # {{{
        title=None, authors=None, identifiers={}, timeout=30, allowed_plugins=None):
    """Query the metadata 'identify' plugins and return the merged results.

    One ``Worker`` is started per usable plugin. Their result queues are
    polled until no worker is alive any more, or until
    ``msprefs['wait_after_first_identify_result']`` has elapsed since the
    first result arrived, in which case ``abort`` is set. The per-plugin
    results are then sorted, de-duplicated, logged, post-processed and
    handed to ``merge_identify_results``.

    :param log: callable log object; ``log.warn`` and ``log.debug`` are
        also used.
    :param abort: object with ``set()`` and ``is_set()``; it is handed to
        every worker and set here when waiting is given up.
    :param title: title to look for; the localized 'Unknown' is treated as
        ``None``.
    :param authors: authors to look for; ``[_('Unknown')]`` is treated as
        ``None``.
    :param identifiers: passed through to the plugins as part of the query.
    :param timeout: passed through to the plugins as part of the query
        (presumably seconds -- confirm against the plugin API).
    :param allowed_plugins: container of plugin names to restrict the query
        to, or ``None`` to use every configured plugin.
    :return: the merged results, after tag mapping, tag truncation,
        clearing of undefined publication dates and optional author name
        swapping.
    """
    # The localized 'Unknown' placeholder is not a usable search term
    if title == _('Unknown'):
        title = None
    if authors == [_('Unknown')]:
        authors = None
    start_time = time.time()

    # Keep only configured plugins and, if a whitelist was given, allowed ones
    plugins = [p for p in metadata_plugins(['identify'])
        if p.is_configured() and (allowed_plugins is None or p.name in allowed_plugins)]

    # Query parameters handed to every worker
    kwargs = {
        'title': title,
        'authors': authors,
        'identifiers': identifiers,
        'timeout': timeout,
    }

    log('Running identify query with parameters:')
    log(kwargs)
    log('Using plugins:', ', '.join(['%s %s' % (p.name, p.version) for p in plugins]))
    log('The log from individual plugins is below')

    workers = [Worker(p, kwargs, abort) for p in plugins]
    for w in workers:
        w.start()

    first_result_at = None
    # plugin -> list of results collected from that plugin's worker
    results = {}
    for p in plugins:
        results[p] = []
    # plugin -> the worker's log buffer, printed after the query finishes
    logs = dict([(w.plugin, w.buf) for w in workers])

    def get_results():
        # Take at most one pending result from each worker's queue without
        # blocking; return True if at least one result was collected.
        found = False
        for w in workers:
            try:
                result = w.rq.get_nowait()
            except Empty:
                pass
            else:
                results[w.plugin].append(result)
                found = True
        return found

    wait_time = msprefs['wait_after_first_identify_result']
    # Poll the workers every 0.2 seconds
    while True:
        time.sleep(0.2)

        if get_results() and first_result_at is None:
            first_result_at = time.time()

        # is_worker_alive is defined elsewhere; presumably true while any
        # worker is still running -- confirm
        if not is_worker_alive(workers):
            break

        # Give up on slow sources once wait_time has passed since the first
        # result, and signal the remaining workers through abort
        if (first_result_at is not None and time.time() - first_result_at > wait_time):
            log.warn('Not waiting any longer for more results. Still running'
                    ' sources:')
            for worker in workers:
                if worker.is_alive():
                    log.debug('\t' + worker.name)
            abort.set()
            break

    # Drain whatever is still queued; skipped entirely once abort is set
    while not abort.is_set() and get_results():
        pass

    # Only these query parameters are passed to the per-plugin sort key
    sort_kwargs = dict(kwargs)
    for k in list(sort_kwargs.iterkeys()):
        if k not in ('title', 'authors', 'identifiers'):
            sort_kwargs.pop(k)

    # Longest download time seen so far and the name of the plugin it belongs to
    longest, lp = -1, ''
    for plugin, presults in results.iteritems():
        presults.sort(key=plugin.identify_results_keygen(**sort_kwargs))

        # Throw away lower priority results from the same source that have exactly the same
        # title and authors as a higher priority result
        filter_results = set()
        filtered_results = []
        for r in presults:
            key = (r.title, tuple(r.authors))
            if key not in filter_results:
                filtered_results.append(r)
                filter_results.add(key)
        results[plugin] = presults = filtered_results

        plog = logs[plugin].getvalue().strip()
        log('\n'+'*'*30, plugin.name, '%s' % (plugin.version,), '*'*30)
        log('Found %d results'%len(presults))
        time_spent = getattr(plugin, 'dl_time_spent', None)
        if time_spent is None:
            log('Downloading was aborted')
            # NOTE(review): this resets `longest` to -1, so a longer time
            # recorded for an earlier plugin is forgotten -- confirm intended
            longest, lp = -1, plugin.name
        else:
            log('Downloading from', plugin.name, 'took', time_spent)
            if time_spent > longest:
                longest, lp = time_spent, plugin.name
        for r in presults:
            log('\n\n---')
            try:
                log(unicode(r))
            except TypeError:
                log(repr(r))
        if plog:
            log(plog)
        log('\n'+'*'*80)

        # A blank Metadata object supplies the values for ignored fields
        dummy = Metadata(_('Unknown'))
        for i, result in enumerate(presults):
            for f in plugin.prefs['ignore_fields']:
                # Entries containing ':' are not plain attribute names and are
                # skipped here (presumably identifier:xxx entries -- confirm)
                if ':' not in f:
                    setattr(result, f, getattr(dummy, f))
                # Ignoring the series also resets its index
                if f == 'series':
                    result.series_index = dummy.series_index
            # Position of this result within its own source after sorting
            result.relevance_in_source = i
            result.has_cached_cover_url = (
                plugin.cached_cover_url_is_reliable and plugin.get_cached_cover_url(result.identifiers) is not None)
            result.identify_plugin = plugin
            # Optionally convert HTML comments to text
            if msprefs['txt_comments']:
                if plugin.has_html_comments and result.comments:
                    result.comments = html2text(result.comments)

    log('The identify phase took %.2f seconds'%(time.time() - start_time))
    log('The longest time (%f) was taken by:'%longest, lp)
    log('Merging results from different sources and finding earliest ',
            'publication dates from the worldcat.org service')
    start_time = time.time()
    results = merge_identify_results(results, log)

    log('We have %d merged results, merging took: %.2f seconds' %
            (len(results), time.time() - start_time))
    tm_rules = msprefs['tag_map_rules']
    if tm_rules:
        # Imported only when there are tag mapping rules to apply
        from calibre.ebooks.metadata.tag_mapper import map_tags

    max_tags = msprefs['max_tags']
    for r in results:
        if tm_rules:
            r.tags = map_tags(r.tags, tm_rules)
        r.tags = r.tags[:max_tags]
        # 2000 is used when pubdate has no `year` attribute; dates whose
        # year is not later than UNDEFINED_DATE's year are cleared
        if getattr(r.pubdate, 'year', 2000) <= UNDEFINED_DATE.year:
            r.pubdate = None

    if msprefs['swap_author_names']:
        for r in results:
            def swap_to_ln_fn(a):
                # 'First Middle Last' -> 'Last, First Middle'; names that
                # already contain a comma or have a single word are kept
                if ',' in a:
                    return a
                parts = a.split(None)
                if len(parts) <= 1:
                    return a
                surname = parts[-1]
                return '%s, %s' % (surname, ' '.join(parts[:-1]))
            r.authors = [swap_to_ln_fn(a) for a in r.authors]

    return results

예제 #27

0

파일 보기

파일: csv_xml.py 프로젝트: smdx023/calibre

    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        """Write a catalog of the library to ``path_to_output`` as CSV or XML.

        The output format is taken from the extension of ``path_to_output``
        ('csv' or 'xml'); for any other extension no file is written. When
        the search matches nothing an error is logged and the (empty)
        catalog is still produced.

        :param path_to_output: destination file path; its extension selects
            the output format.
        :param opts: options object. This method reads ``verbose``,
            ``connected_device``, ``ids``, ``search_text`` and ``fields``
            plus the optional ``library_path`` and ``catalog_title``
            attributes. ``opts.search_text`` is set to ``None`` when
            ``opts.ids`` is given.
        :param db: library database, used for custom column values, field
            metadata and the on-device mapping.
        :param notification: progress reporter; it is only stored on
            ``self.notification`` here.
        """
        from calibre.library import current_library_name
        from calibre.utils.date import isoformat
        from calibre.utils.html2text import html2text
        from calibre.utils.logging import default_log as log
        from lxml import etree
        from calibre.ebooks.metadata import authors_to_string

        self.fmt = path_to_output.rpartition('.')[2]
        self.notification = notification
        current_library = current_library_name()
        # An explicit library path overrides the current library's name
        if getattr(opts, 'library_path', None):
            current_library = os.path.basename(opts.library_path)

        if opts.verbose:
            opts_dict = vars(opts)
            log("%s('%s'): Generating %s" %
                (self.name, current_library, self.fmt.upper()))
            if opts.connected_device['is_device_connected']:
                log(" connected_device: %s" % opts.connected_device['name'])
            if opts_dict['search_text']:
                log(" --search='%s'" % opts_dict['search_text'])

            if opts_dict['ids']:
                log(" Book count: %d" % len(opts_dict['ids']))
                if opts_dict['search_text']:
                    log(" (--search ignored when a subset of the database is specified)"
                        )

            if opts_dict['fields']:
                if opts_dict['fields'] == 'all':
                    log(" Fields: %s" % ', '.join(FIELDS[1:]))
                else:
                    log(" Fields: %s" % opts_dict['fields'])

        # If a list of ids are provided, don't use search_text
        if opts.ids:
            opts.search_text = None

        data = self.search_sort_db(db, opts)

        if not len(data):
            log.error(
                "\nNo matching database entries for search criteria '%s'" %
                opts.search_text)
            # raise SystemExit(1)

        # Get the requested output fields as a list
        fields = self.get_output_fields(db, opts)

        # If connected device, add 'On Device' values to data
        if opts.connected_device[
                'is_device_connected'] and 'ondevice' in fields:
            for entry in data:
                entry['ondevice'] = db.catalog_plugin_on_device_temp_mapping[
                    entry['id']]['ondevice']

        # Field metadata for each requested field ({} when unknown)
        fm = {x: db.field_metadata.get(x, {}) for x in fields}

        if self.fmt == 'csv':
            # The context manager closes the file even when serializing a
            # row raises part-way through the export.
            with codecs.open(path_to_output, 'w', 'utf8') as outfile:
                # Write a UTF-8 BOM
                outfile.write('\ufeff')

                # Output the field headers
                outfile.write('%s\n' % ','.join(fields))

                # Output the entry fields
                for entry in data:
                    outstr = []
                    for field in fields:
                        if field.startswith('#'):
                            # Custom columns are read from the database
                            item = db.get_field(entry['id'],
                                                field,
                                                index_is_id=True)
                            if isinstance(item, (list, tuple)):
                                display = fm.get(field, {}).get('display', {})
                                if display.get('is_names', False):
                                    item = ' & '.join(item)
                                else:
                                    item = ', '.join(item)
                        elif field == 'library_name':
                            item = current_library
                        elif field == 'title_sort':
                            item = entry['sort']
                        else:
                            item = entry[field]

                        if item is None:
                            outstr.append('""')
                            continue
                        elif field == 'formats':
                            # Keep only the lower-cased file extensions
                            item = ', '.join(
                                path.rpartition('.')[2].lower()
                                for path in item)
                        elif field == 'authors':
                            item = authors_to_string(item)
                        elif field == 'tags':
                            item = ', '.join(item)
                        elif field == 'isbn':
                            # Could be 9, 10 or 13 digits, with hyphens, possibly ending in 'X'
                            item = '%s' % re.sub(r'[^\dX-]', '', item)
                        elif fm.get(field, {}).get('datatype') == 'datetime':
                            item = isoformat(item, as_utc=False)
                        elif field == 'comments':
                            # Keep each record on a single CSV line
                            item = item.replace('\r\n', ' ')
                            item = item.replace('\n', ' ')
                        elif fm.get(field, {}).get('datatype',
                                                   None) == 'rating' and item:
                            # The stored rating is halved for output
                            item = '%.2g' % (item / 2)

                        # Convert HTML to markdown text. A value counts as
                        # HTML when it ends with the closing tag of the
                        # first opening tag found in it.
                        if isinstance(item, str):
                            opening_tag = re.search(r'<(\w+)( |>)', item)
                            if opening_tag:
                                closing_tag = re.search(
                                    r'<\/%s>$' % opening_tag.group(1), item)
                                if closing_tag:
                                    item = html2text(item)

                        # Quote every value, doubling embedded quotes
                        outstr.append('"%s"' % str(item).replace('"', '""'))

                    outfile.write(','.join(outstr) + '\n')

        elif self.fmt == 'xml':
            from lxml.builder import E

            if getattr(opts, 'catalog_title', None):
                root = E.calibredb(title=opts.catalog_title)
            else:
                root = E.calibredb()
            for r in data:
                record = E.record()
                root.append(record)

                # Custom columns: '#name' becomes the element '_name'
                for field in fields:
                    if field.startswith('#'):
                        val = db.get_field(r['id'], field, index_is_id=True)
                        if not isinstance(val, str):
                            val = str(val)
                        item = getattr(E, field.replace('#', '_'))(val)
                        record.append(item)

                # Simple fields written as text; falsy values are omitted
                for field in ('id', 'uuid', 'publisher', 'rating', 'size',
                              'isbn', 'ondevice', 'identifiers'):
                    if field in fields:
                        val = r[field]
                        if not val:
                            continue
                        if not isinstance(val, (bytes, str)):
                            if (fm.get(field, {}).get('datatype', None)
                                    == 'rating' and val):
                                val = '%.2g' % (val / 2)
                            val = str(val)
                        item = getattr(E, field)(val)
                        record.append(item)

                if 'title' in fields:
                    title = E.title(r['title'], sort=r['sort'])
                    record.append(title)

                if 'authors' in fields:
                    aus = E.authors(sort=r['author_sort'])
                    for au in r['authors']:
                        aus.append(E.author(au))
                    record.append(aus)

                for field in ('timestamp', 'pubdate'):
                    if field in fields:
                        record.append(
                            getattr(E, field)(isoformat(r[field],
                                                        as_utc=False)))

                if 'tags' in fields and r['tags']:
                    tags = E.tags()
                    for tag in r['tags']:
                        tags.append(E.tag(tag))
                    record.append(tags)

                if 'comments' in fields and r['comments']:
                    record.append(E.comments(r['comments']))

                if 'series' in fields and r['series']:
                    record.append(
                        E.series(r['series'], index=str(r['series_index'])))

                if 'languages' in fields and r['languages']:
                    record.append(E.languages(r['languages']))

                if 'cover' in fields and r['cover']:
                    # Paths are written with forward slashes
                    record.append(E.cover(r['cover'].replace(os.sep, '/')))

                if 'formats' in fields and r['formats']:
                    fmt = E.formats()
                    for f in r['formats']:
                        fmt.append(E.format(f.replace(os.sep, '/')))
                    record.append(fmt)

                if 'library_name' in fields:
                    record.append(E.library_name(current_library))

            # etree.tostring is called with an encoding, so the file is
            # opened in binary mode
            with open(path_to_output, 'wb') as f:
                f.write(
                    etree.tostring(root,
                                   encoding='utf-8',
                                   xml_declaration=True,
                                   pretty_print=True))

예제 #28

0

파일 보기

파일: csv_xml.py 프로젝트: GaryMMugford/calibre

    def run(self, path_to_output, opts, db, notification=DummyReporter()):
        """Write a catalog of the library to ``path_to_output`` as CSV or XML.

        The output format is taken from the extension of ``path_to_output``
        ("csv" or "xml"); for any other extension no file is written. When
        the search matches nothing an error is logged and the (empty)
        catalog is still produced.

        :param path_to_output: destination file path; its extension selects
            the output format.
        :param opts: options object. This method reads ``verbose``,
            ``connected_device``, ``ids``, ``search_text`` and ``fields``
            plus the optional ``library_path`` attribute.
            ``opts.search_text`` is set to ``None`` when ``opts.ids`` is
            given.
        :param db: library database, used for custom column values, field
            metadata and the on-device mapping.
        :param notification: progress reporter; it is only stored on
            ``self.notification`` here.
        """
        from calibre.library import current_library_name
        from calibre.utils.date import isoformat
        from calibre.utils.html2text import html2text
        from calibre.utils.logging import default_log as log
        from lxml import etree

        self.fmt = path_to_output.rpartition(".")[2]
        self.notification = notification
        current_library = current_library_name()
        # An explicit library path overrides the current library's name
        if getattr(opts, "library_path", None):
            current_library = os.path.basename(opts.library_path)

        if opts.verbose:
            opts_dict = vars(opts)
            log("%s('%s'): Generating %s" % (self.name, current_library, self.fmt.upper()))
            if opts.connected_device["is_device_connected"]:
                log(" connected_device: %s" % opts.connected_device["name"])
            if opts_dict["search_text"]:
                log(" --search='%s'" % opts_dict["search_text"])

            if opts_dict["ids"]:
                log(" Book count: %d" % len(opts_dict["ids"]))
                if opts_dict["search_text"]:
                    log(" (--search ignored when a subset of the database is specified)")

            if opts_dict["fields"]:
                if opts_dict["fields"] == "all":
                    log(" Fields: %s" % ", ".join(FIELDS[1:]))
                else:
                    log(" Fields: %s" % opts_dict["fields"])

        # If a list of ids are provided, don't use search_text
        if opts.ids:
            opts.search_text = None

        data = self.search_sort_db(db, opts)

        if not len(data):
            log.error("\nNo matching database entries for search criteria '%s'" % opts.search_text)
            # raise SystemExit(1)

        # Get the requested output fields as a list
        fields = self.get_output_fields(db, opts)

        # If connected device, add 'On Device' values to data
        if opts.connected_device["is_device_connected"] and "ondevice" in fields:
            for entry in data:
                entry["ondevice"] = db.catalog_plugin_on_device_temp_mapping[entry["id"]]["ondevice"]

        # Field metadata for each requested field ({} when unknown)
        fm = {x: db.field_metadata.get(x, {}) for x in fields}

        if self.fmt == "csv":
            # The context manager closes the file even when serializing a
            # row raises part-way through the export.
            with codecs.open(path_to_output, "w", "utf8") as outfile:
                # Write a UTF-8 BOM. It is written as the text character
                # U+FEFF so that the utf8 writer encodes it, instead of
                # pushing pre-encoded bytes through a text writer.
                outfile.write(u"\ufeff")

                # Output the field headers
                outfile.write(u"%s\n" % u",".join(fields))

                # Output the entry fields
                for entry in data:
                    outstr = []
                    for field in fields:
                        if field.startswith("#"):
                            # Custom columns are read from the database
                            item = db.get_field(entry["id"], field, index_is_id=True)
                        elif field == "library_name":
                            item = current_library
                        elif field == "title_sort":
                            item = entry["sort"]
                        else:
                            item = entry[field]

                        if item is None:
                            outstr.append('""')
                            continue
                        elif field == "formats":
                            # Keep only the lower-cased file extensions
                            item = ", ".join(path.rpartition(".")[2].lower() for path in item)
                        elif field in ["authors", "tags"]:
                            item = ", ".join(item)
                        elif field == "isbn":
                            # Could be 9, 10 or 13 digits, with hyphens, possibly ending in 'X'
                            item = u"%s" % re.sub(r"[^\dX-]", "", item)
                        elif field in ["pubdate", "timestamp"]:
                            item = isoformat(item, as_utc=False)
                        elif field == "comments":
                            # Keep each record on a single CSV line
                            item = item.replace(u"\r\n", u" ")
                            item = item.replace(u"\n", u" ")
                        elif fm.get(field, {}).get("datatype", None) == "rating" and item:
                            # The stored rating is halved for output
                            item = u"%.2g" % (item / 2.0)

                        # Convert HTML to markdown text. A value counts as
                        # HTML when it ends with the closing tag of the
                        # first opening tag found in it.
                        if isinstance(item, unicode):
                            opening_tag = re.search(r"<(\w+)( |>)", item)
                            if opening_tag:
                                closing_tag = re.search(r"<\/%s>$" % opening_tag.group(1), item)
                                if closing_tag:
                                    item = html2text(item)

                        # Quote every value, doubling embedded quotes
                        outstr.append(u'"%s"' % unicode(item).replace('"', '""'))

                    outfile.write(u",".join(outstr) + u"\n")

        elif self.fmt == "xml":
            from lxml.builder import E

            root = E.calibredb()
            for r in data:
                record = E.record()
                root.append(record)

                # Custom columns: '#name' becomes the element '_name'
                for field in fields:
                    if field.startswith("#"):
                        val = db.get_field(r["id"], field, index_is_id=True)
                        if not isinstance(val, (str, unicode)):
                            val = unicode(val)
                        item = getattr(E, field.replace("#", "_"))(val)
                        record.append(item)

                # Simple fields written as text; falsy values are omitted
                for field in ("id", "uuid", "publisher", "rating", "size", "isbn", "ondevice", "identifiers"):
                    if field in fields:
                        val = r[field]
                        if not val:
                            continue
                        if not isinstance(val, (str, unicode)):
                            if fm.get(field, {}).get("datatype", None) == "rating" and val:
                                val = u"%.2g" % (val / 2.0)
                            val = unicode(val)
                        item = getattr(E, field)(val)
                        record.append(item)

                if "title" in fields:
                    title = E.title(r["title"], sort=r["sort"])
                    record.append(title)

                if "authors" in fields:
                    aus = E.authors(sort=r["author_sort"])
                    for au in r["authors"]:
                        aus.append(E.author(au))
                    record.append(aus)

                for field in ("timestamp", "pubdate"):
                    if field in fields:
                        record.append(getattr(E, field)(isoformat(r[field], as_utc=False)))

                if "tags" in fields and r["tags"]:
                    tags = E.tags()
                    for tag in r["tags"]:
                        tags.append(E.tag(tag))
                    record.append(tags)

                if "comments" in fields and r["comments"]:
                    record.append(E.comments(r["comments"]))

                if "series" in fields and r["series"]:
                    record.append(E.series(r["series"], index=str(r["series_index"])))

                if "cover" in fields and r["cover"]:
                    # Paths are written with forward slashes
                    record.append(E.cover(r["cover"].replace(os.sep, "/")))

                if "formats" in fields and r["formats"]:
                    fmt = E.formats()
                    for f in r["formats"]:
                        fmt.append(E.format(f.replace(os.sep, "/")))
                    record.append(fmt)

                if "library_name" in fields:
                    record.append(E.library_name(current_library))

            # etree.tostring is called with an encoding and yields encoded
            # bytes, so the file is opened in binary mode to avoid newline
            # translation of the serialized document
            with open(path_to_output, "wb") as f:
                f.write(etree.tostring(root, encoding="utf-8", xml_declaration=True, pretty_print=True))