Example #1
 def evaluate(self, formatter, kwargs, mi, locals, date1, date2):
     try:
         d1 = parse_date(date1)
         if d1 == UNDEFINED_DATE:
             return ""
         d2 = parse_date(date2)
         if d2 == UNDEFINED_DATE:
             return ""
     except:
         return ""
     i = d1 - d2
     return str("%d.%d" % (i.days, i.seconds / 8640))
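The division by 8640 is easy to misread: a day has 86400 seconds, so i.seconds / 8640 is the remainder expressed in tenths of a day, which is what produces the "days.tenths" string this template function returns. A minimal standalone sketch of the same arithmetic, assuming calibre.utils.date.parse_date is importable exactly as in the other examples:

from calibre.utils.date import parse_date

d1 = parse_date('2013-07-25 12:00:00+00:00')
d2 = parse_date('2013-07-22 00:00:00+00:00')
delta = d1 - d2
# delta.days holds the whole days, delta.seconds the remainder inside the last day;
# dividing that remainder by 8640 (86400 / 10) turns it into tenths of a day.
print('%d.%d' % (delta.days, delta.seconds / 8640))  # prints 3.5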
Example #2
def save_serialized_to_disk(ids, data, plugboards, root, opts, callback):
    from calibre.ebooks.metadata.opf2 import OPF
    root, opts, length = _sanitize_args(root, opts)
    failures = []
    for x in ids:
        opf, cover, format_map, last_modified = data[x]
        if isinstance(opf, unicode):
            opf = opf.encode('utf-8')
        mi = OPF(cStringIO.StringIO(opf)).to_book_metadata()
        try:
            mi.last_modified = parse_date(last_modified)
        except:
            pass
        tb = ''
        try:
            with open(cover, 'rb') as f:
                cover = f.read()
        except:
            cover = None
        try:
            failed, id, title = do_save_book_to_disk(x, mi, cover,
                plugboards, format_map, root, opts, length)
            tb = _('Requested formats not available')
        except:
            failed, id, title = True, x, mi.title
            tb = traceback.format_exc()
        if failed:
            failures.append((id, title, tb))
        if callable(callback):
            if not callback(int(id), title, failed, tb):
                break

    return failures
Example #3
    def fix_pubdates(self):
        from calibre.utils.date import parse_date, strptime

        dirtied = False
        opf = self.container.opf
        for dcdate in opf.xpath('//dc:date',
                namespaces={'dc':'http://purl.org/dc/elements/1.1/'}):
            raw = dcdate.text
            if not raw: raw = ''
            default = strptime('2000-1-1', '%Y-%m-%d', as_utc=True)
            try:
                ts = parse_date(raw, assume_utc=False, as_utc=True,
                        default=default)
            except:
                raise InvalidEpub('Invalid date set in OPF', raw)
            try:
                sval = ts.strftime('%Y-%m-%d')
            except:
                from calibre import strftime
                sval = strftime('%Y-%m-%d', ts.timetuple())
            if sval != raw:
                self.log.error(
                    'OPF contains date', raw, 'that epubcheck does not like')
                if self.fix:
                    dcdate.text = sval
                    self.log('\tReplaced', raw, 'with', sval)
                    dirtied = True
        if dirtied:
            self.container.set(self.container.opf_name, opf)
Example #4
def consolidate_metadata(info_mi, info):
    """ When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec. """
    try:
        xmp_mi = metadata_from_xmp_packet(info["xmp_metadata"])
    except:
        import traceback

        traceback.print_exc()
        return info_mi
    info_title, info_authors, info_tags = (
        info_mi.title or _("Unknown"),
        list(info_mi.authors or ()),
        list(info_mi.tags or ()),
    )
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if "ModDate" in info and hasattr(xmp_mi, "metadata_date"):
        try:
            info_date = parse_date(info["ModDate"])
        except:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
    else:
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
        info_mi.authors, info_mi.tags = xmp_mi.authors or info_mi.authors, xmp_mi.tags or info_mi.tags
    return info_mi
Example #5
def _parse_timestamp(root, mi):
    #<date value="1996-12-03">03.12.1996</date>
    xp ='//fb2:document-info/fb2:date/@value|'\
        '//fb2:document-info/fb2:date/text()'
    docdate = XPath('string(%s)' % xp)(root)
    if docdate:
        mi.timestamp = parse_date(docdate)
Example #6
def consolidate_metadata(info_mi, info):
    ''' When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec. '''
    try:
        raw = info['xmp_metadata'].rstrip()
        if not raw:
            return info_mi
        xmp_mi = metadata_from_xmp_packet(raw)
    except Exception:
        import traceback
        traceback.print_exc()
        return info_mi
    info_title, info_authors, info_tags = info_mi.title or _('Unknown'), list(info_mi.authors or ()), list(info_mi.tags or ())
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if 'ModDate' in info and hasattr(xmp_mi, 'metadata_date'):
        try:
            info_date = parse_date(info['ModDate'])
        except Exception:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
    else:
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
        info_mi.authors, info_mi.tags = (info_authors if xmp_mi.is_null('authors') else xmp_mi.authors), xmp_mi.tags or info_tags
    return info_mi
Example #7
def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata','') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data

    return mi
Example #8
def convert_kobo_date(kobo_date):
    from calibre.utils.date import utc_tz

    try:
        converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S.%f")
        converted_date = datetime.strptime(kobo_date[0:19], "%Y-%m-%dT%H:%M:%S")
        converted_date = converted_date.replace(tzinfo=utc_tz)
#            debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S.%f' - kobo_date={0}'".format(kobo_date))
    except:
        try:
            converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S+00:00")
#                debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S+00:00' - kobo_date=%s' - kobo_date={0}'".format(kobo_date))
        except:
            try:
                converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                converted_date = converted_date.replace(tzinfo=utc_tz)
#                    debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
            except:
                try:
                    converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%d")
                    converted_date = converted_date.replace(tzinfo=utc_tz)
#                        debug_print("convert_kobo_date - '%Y-%m-%d' - kobo_date={0}'".format(kobo_date))
                except:
                    try:
                        from calibre.utils.date import parse_date
                        converted_date = parse_date(kobo_date, assume_utc=True)
#                            debug_print("convert_kobo_date - parse_date - kobo_date=%s' - kobo_date={0}'".format(kobo_date))
                    except:
#                        try:
#                            converted_date = time.gmtime(os.path.getctime(self.path))
#                            debug_print("convert_kobo_date - time.gmtime(os.path.getctime(self.path)) - kobo_date={0}'".format(kobo_date))
#                        except:
                        converted_date = time.gmtime()
                        debug_print("convert_kobo_date - time.gmtime() - kobo_date={0}'".format(kobo_date))
    return converted_date
Example #9
    def __init__(self, prefix, lpath, title=None, authors=None, mime=None, date=None, ContentType=None,
                 thumbnail_name=None, size=None, other=None):
        from calibre.utils.date import parse_date
#         debug_print('Book::__init__ - title=', title)
        show_debug = title is not None and title.lower().find("xxxxx") >= 0
        if other is not None:
            other.title = title
            other.published_date = date
        if show_debug:
            debug_print("Book::__init__ - title=", title, 'authors=', authors)
            debug_print("Book::__init__ - other=", other)
        super(Book, self).__init__(prefix, lpath, size, other)

        if title is not None and len(title) > 0:
            self.title = title

        if authors is not None and len(authors) > 0:
            self.authors_from_string(authors)
            if self.author_sort is None or self.author_sort == "Unknown":
                self.author_sort = author_to_author_sort(authors)

        self.mime = mime

        self.size = size  # will be set later if None

        if ContentType == '6' and date is not None:
            try:
                self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
            except:
                try:
                    self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                except:
                    try:
                        self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%d")
                    except:
                        try:
                            self.datetime = parse_date(date,
                                    assume_utc=True).timetuple()
                        except:
                            try:
                                self.datetime = time.gmtime(os.path.getctime(self.path))
                            except:
                                self.datetime = time.gmtime()

        self.kobo_metadata = Metadata(title, self.authors)
        self.contentID          = None
        self.current_shelves    = []
        self.kobo_collections   = []
        self.can_put_on_shelves = True
        self.kobo_series        = None
        self.kobo_series_number = None  # Kobo stores the series number as string. And it can have a leading "#".
        self.kobo_subtitle      = None

        if thumbnail_name is not None:
            self.thumbnail = ImageWrapper(thumbnail_name)

        if show_debug:
            debug_print("Book::__init__ end - self=", self)
            debug_print("Book::__init__ end - title=", title, 'authors=', authors)
Example #10
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw, strip_encoding_pats=True,
                resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {'english':'eng', 'french':'fra', 'german':'deu',
                    'spanish':'spa'}.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print "ebook isbn is "+str(ebook_isbn[0])
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html', encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
Example #11
def from_json(obj):
    if '__class__' in obj:
        if obj['__class__'] == 'bytearray':
            return bytearray(base64.standard_b64decode(obj['__value__']))
        if obj['__class__'] == 'datetime.datetime':
            from calibre.utils.date import parse_date
            return parse_date(obj['__value__'], assume_utc=True)
    return obj
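A hook of this shape is the kind of function that gets passed to json.loads as object_hook, so every decoded dict runs through it and tagged values are rehydrated. A hedged usage sketch: the payload below is an assumption about what the matching encoder writes, and only the __class__/__value__ convention comes from the example itself.

import json

payload = '{"added": {"__class__": "datetime.datetime", "__value__": "2013-07-22T15:18:29+00:00"}}'
decoded = json.loads(payload, object_hook=from_json)
# decoded['added'] is now a timezone-aware datetime instead of a plain string
print(decoded['added'].isoformat())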
Example #12
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         self.mi.authors.append(au)
         if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
             self.mi.author_sort = au.strip()
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments  = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Example #13
def string_to_datetime(src):
    from calibre.utils.date import parse_date

    if src != "None":
        try:
            return parse_date(src)
        except Exception:
            pass
    return None
Example #14
 def evaluate(self, formatter, kwargs, mi, locals, val, format_string):
     if not val or val == "None":
         return ""
     try:
         dt = parse_date(val)
         s = format_date(dt, format_string)
     except:
         s = "BAD DATE"
     return s
Example #15
 def get_date(self, entry, verbose):
     try:
         d = date(entry)
         if d:
             default = utcnow().replace(day=15)
             d = parse_date(d[0].text, assume_utc=True, default=default)
         else:
             d = None
     except:
         report(verbose)
         d = None
     return d
Example #16
    def __init__(self, prefix, lpath, title=None, authors=None, mime=None, date=None, ContentType=None,
                 thumbnail_name=None, size=None, other=None):
#        debug_print('Book::__init__ - title=', title)
        show_debug = title is not None and title.lower().find("xxxxx") >= 0
        if show_debug:
            debug_print("Book::__init__ - title=", title, 'authors=', authors)
            debug_print("Book::__init__ - other=", other)
        Book_.__init__(self, prefix, lpath, size, other)

        if title is not None and len(title) > 0:
            self.title = title

        if authors is not None and len(authors) > 0:
            self.authors_from_string(authors)
            if self.author_sort is None or self.author_sort == "Unknown":
                self.author_sort = author_to_author_sort(authors)

        self.mime = mime

        self.size = size # will be set later if None

        if ContentType == '6' and date is not None:
            try:
                self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
            except:
                try:
                    self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                except:
                    try:
                        self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%d")
                    except:
                        try:
                            self.datetime = parse_date(date,
                                    assume_utc=True).timetuple()
                        except:
                            try:
                                self.datetime = time.gmtime(os.path.getctime(self.path))
                            except:
                                self.datetime = time.gmtime()

        self.contentID          = None
        self.current_shelves    = []
        self.kobo_collections   = []
        self.kobo_series        = None
        self.kobo_series_number = None

        if thumbnail_name is not None:
            self.thumbnail = ImageWrapper(thumbnail_name)

        if show_debug:
            debug_print("Book::__init__ end - self=", self)
            debug_print("Book::__init__ end - title=", title, 'authors=', authors)
Example #17
    def data2mi(self, item):
        """Converts a single metadata answer in the form of a dict to a MetadataInformation object"""

        mi = Metadata(_('Unknown'))

        # Regular metadata
        mi.title = item.get('title', None)
        mi.authors = item.get('authors', [])
        mi.publisher = item.get('publisher', None)

        if 'id' in item.keys(): mi.set_identifier(self.idkey, item['id'])
        if 'doi' in item.keys(): mi.set_identifier('doi', item['doi'])
        if 'isbn' in item.keys(): mi.set_identifier('isbn', item['isbn'])

        if 'updated' in item.keys(): mi.pubdate = parse_date(item['updated'], assume_utc=True)

        if 'series' in item.keys():
            mi.series = item['series']
            mi.series_index = self.format_series_index(item.get('series_index'), None)

        if 'year' in item.keys(): mi.pubdate = parse_date(item['year'], assume_utc=True)

        if 'abstract' in item.keys(): mi.comments = self.format_abstract(item['abstract'])

        if 'language' in item.keys(): mi.language = item['language']

        if 'journal' in item.keys():
            mi.series = item['journal']
            mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

        if 'subject' in item.keys():
            tags = set([])
            for s in item['subject']:
                tags.update(msc_tags(s))
                tags.update(arxiv_tags(s))

            mi.tags = list(sorted(tags))

        return mi
Example #18
 def un_serialize_schedule(self, recipe):
     for x in recipe.iterdescendants():
         if 'schedule' in x.tag:
             sch, typ = x.text, x.get('type')
             if typ == 'interval':
                 sch = float(sch)
             elif typ == 'day/time':
                 sch = list(map(int, sch.split(':')))
             elif typ in ('days_of_week', 'days_of_month'):
                 parts = sch.split(':')
                 days = list(map(int, [x.strip() for x in
                     parts[0].split(',')]))
                 sch = [days, int(parts[1]), int(parts[2])]
             return typ, sch, parse_date(recipe.get('last_downloaded'))
Example #19
def read_serialized_metadata(book_id, data):
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.date import parse_date
    mi = OPF(data['opf'], try_to_guess_cover=False, populate_spine=False, basedir=os.path.dirname(data['opf'])).to_book_metadata()
    try:
        mi.last_modified = parse_date(data['last_modified'])
    except:
        pass
    mi.cover, mi.cover_data = None, (None, None)
    cdata = None
    if 'cover' in data:
        with lopen(data['cover'], 'rb') as f:
            cdata = f.read()
    return mi, cdata
Example #20
def do_set_metadata(opts, mi, stream, stream_type):
    mi = MetaInformation(mi)
    for x in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, x, None)

    from_opf = getattr(opts, 'from_opf', None)
    if from_opf is not None:
        from calibre.ebooks.metadata.opf2 import OPF
        opf_mi = OPF(open(from_opf, 'rb')).to_book_metadata()
        mi.smart_update(opf_mi)

    for pref in config().option_set.preferences:
        if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort',
                         'author_sort', 'get_cover', 'cover', 'tags',
                         'lrf_bookid', 'identifiers'):
            continue
        val = getattr(opts, pref.name, None)
        if val is not None:
            setattr(mi, pref.name, val)
    if getattr(opts, 'authors', None) is not None:
        mi.authors = string_to_authors(opts.authors)
        mi.author_sort = authors_to_sort_string(mi.authors)
    if getattr(opts, 'author_sort', None) is not None:
        mi.author_sort = opts.author_sort
    if getattr(opts, 'title_sort', None) is not None:
        mi.title_sort = opts.title_sort
    elif getattr(opts, 'title', None) is not None:
        mi.title_sort = title_sort(opts.title)
    if getattr(opts, 'tags', None) is not None:
        mi.tags = [t.strip() for t in opts.tags.split(',')]
    if getattr(opts, 'series', None) is not None:
        mi.series = opts.series.strip()
    if getattr(opts, 'series_index', None) is not None:
        mi.series_index = float(opts.series_index.strip())
    if getattr(opts, 'pubdate', None) is not None:
        mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False)
    if getattr(opts, 'identifiers', None):
        val = {k.strip():v.strip() for k, v in (x.partition(':')[0::2] for x in opts.identifiers)}
        if val:
            orig = mi.get_identifiers()
            orig.update(val)
            val = {k:v for k, v in iteritems(orig) if k and v}
            mi.set_identifiers(val)

    if getattr(opts, 'cover', None) is not None:
        ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()
        mi.cover_data = (ext, open(opts.cover, 'rb').read())

    with force_identifiers:
        set_metadata(stream, mi, stream_type)
Example #21
def _c_convert_timestamp(val):
    if not val:
        return None
    try:
        ret = _c_speedup.parse_date(val.strip())
    except:
        ret = None
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    try:
        return datetime(year, month, day, hour, minutes, seconds, tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)
Example #22
    def test_datetime(self):  # {{{
        ' Test the reading of datetimes stored in the db '
        from calibre.utils.date import parse_date
        from calibre.db.tables import c_parse, UNDEFINED_DATE, _c_speedup

        # First test parsing of string to UTC time
        for raw in ('2013-07-22 15:18:29+05:30', '  2013-07-22 15:18:29+00:00', '2013-07-22 15:18:29', '2003-09-21 23:30:00-06:00'):
            self.assertTrue(_c_speedup(raw))
            ctime = c_parse(raw)
            pytime = parse_date(raw, assume_utc=True)
            self.assertEqual(ctime, pytime)

        self.assertEqual(c_parse(2003).year, 2003)
        for x in (None, '', 'abc'):
            self.assertEqual(UNDEFINED_DATE, c_parse(x))
Example #23
def _py_convert_timestamp(val):
    if val:
        tzsecs = 0
        try:
            sign = {'+':1, '-':-1}.get(val[-6], None)
            if sign is not None:
                tzsecs = 60*((int(val[-5:-3])*60 + int(val[-2:])) * sign)
            year = int(val[0:4])
            month = int(val[5:7])
            day = int(val[8:10])
            hour = int(val[11:13])
            min = int(val[14:16])
            sec = int(val[17:19])
            return datetime(year, month, day, hour, min, sec,
                    tzinfo=tzoffset(None, tzsecs))
        except:
            pass
        return parse_date(val, as_utc=False)
    return None
Example #24
def convert_kobo_date(kobo_date):
    """
    KoBo stores dates as a timestamp string. The exact format has changed across firmware
    versions and depends on which part of the firmware wrote it. The following is overkill,
    but it handles all the formats I have seen.
    """
    from calibre.utils.date import utc_tz, local_tz
    from calibre.devices.usbms.driver import debug_print
#     debug_print("convert_kobo_date - start - kobo_date={0}'".format(kobo_date))

    try:
        converted_date = datetime.datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S+00:00")
#         debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S+00:00' - kobo_date=%s' - kobo_date={0}'".format(kobo_date))
    except Exception as e:
#         debug_print("convert_kobo_date - exception={0}'".format(e))
        try:
            converted_date = datetime.datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%SZ")
#             debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%SZ' - kobo_date={0}'".format(kobo_date))
        except:
            try:
                converted_date = datetime.datetime.strptime(kobo_date[0:19], "%Y-%m-%dT%H:%M:%S")
#                 debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
            except:
                try:
                    converted_date = datetime.datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
#                     debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
                except:
                    try:
                        converted_date = datetime.datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%d")
    #                     converted_date = converted_date.replace(tzinfo=utc_tz)
#                         debug_print("convert_kobo_date - '%Y-%m-%d' - kobo_date={0}'".format(kobo_date))
                    except:
                        try:
                            from calibre.utils.date import parse_date
                            converted_date = parse_date(kobo_date)#, assume_utc=True)
#                             debug_print("convert_kobo_date - parse_date - kobo_date={0}'".format(kobo_date))
                        except:
                            converted_date = time.gmtime()
                            debug_print("convert_kobo_date - could not convert, using current time - kobo_date={0}'".format(kobo_date))
    
    converted_date = converted_date.replace(tzinfo=utc_tz).astimezone(local_tz)
    return converted_date
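As the docstring says, the nested try/except blocks simply walk through candidate timestamp formats. Below is a more compact sketch of the same fallback chain, written as a loop over (string, format) pairs with parse_date as the catch-all; it is an alternative phrasing for illustration, not calibre's actual implementation, and it drops the final time.gmtime() fallback.

import datetime

def convert_kobo_date_compact(kobo_date):
    from calibre.utils.date import utc_tz, local_tz, parse_date
    # Mirror the fallbacks above as (string to parse, strptime format) pairs.
    attempts = (
        (kobo_date, "%Y-%m-%dT%H:%M:%S+00:00"),
        (kobo_date, "%Y-%m-%dT%H:%M:%SZ"),
        (kobo_date[0:19], "%Y-%m-%dT%H:%M:%S"),
        (kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S"),
        (kobo_date.split('+')[0], "%Y-%m-%d"),
    )
    for raw, fmt in attempts:
        try:
            converted = datetime.datetime.strptime(raw, fmt)
            break
        except ValueError:
            continue
    else:
        converted = parse_date(kobo_date)  # general-purpose fallback
    return converted.replace(tzinfo=utc_tz).astimezone(local_tz)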
Example #25
 def schedule_recipe(self, recipe, schedule_type, schedule, last_downloaded=None):
     with self.lock:
         for x in list(self.iter_recipes()):
             if x.get('id', False) == recipe.get('id'):
                 ld = x.get('last_downloaded', None)
                 if ld and last_downloaded is None:
                     try:
                         last_downloaded = parse_date(ld)
                     except:
                         pass
                 self.root.remove(x)
                 break
         if last_downloaded is None:
             last_downloaded = fromordinal(1)
         sr = E.scheduled_recipe({
             'id' : recipe.get('id'),
             'title': recipe.get('title'),
             'last_downloaded':isoformat(last_downloaded),
             }, self.serialize_schedule(schedule_type, schedule))
         self.root.append(sr)
         self.write_scheduler_file()
Example #26
    def __init__(self, devs, blacklist):
        QWidget.__init__(self)
        self.l = l = QVBoxLayout()
        self.setLayout(l)
        self.la = la = QLabel('<p>'+_(
            '''Select the devices to be <b>ignored</b>. calibre <b>will not</b>
            connect to devices with a checkmark next to their names.'''))
        la.setWordWrap(True)
        l.addWidget(la)
        self.f = f = QListWidget(self)
        l.addWidget(f)

        devs = [(snum, (x[0], parse_date(x[1]))) for snum, x in
                devs.iteritems()]
        for dev, x in sorted(devs, key=lambda x:x[1][1], reverse=True):
            name = x[0]
            name = '%s [%s]'%(name, dev)
            item = QListWidgetItem(name, f)
            item.setData(Qt.UserRole, dev)
            item.setFlags(Qt.ItemIsEnabled|Qt.ItemIsUserCheckable|Qt.ItemIsSelectable)
            item.setCheckState(Qt.Checked if dev in blacklist else Qt.Unchecked)
Example #27
def convert_kobo_date(kobo_date):
    """
    KoBo stores dates as a timestamp string. The exact format has changed across firmware
    versions and depends on which part of the firmware wrote it. The following is overkill,
    but it handles all the formats I have seen.
    """
    from calibre.utils.date import utc_tz

    try:
        converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S.%f")
        converted_date = datetime.strptime(kobo_date[0:19], "%Y-%m-%dT%H:%M:%S")
        converted_date = converted_date.replace(tzinfo=utc_tz)
#            debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S.%f' - kobo_date={0}'".format(kobo_date))
    except:
        try:
            converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S+00:00")
#                debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S+00:00' - kobo_date=%s' - kobo_date={0}'".format(kobo_date))
        except:
            try:
                converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                converted_date = converted_date.replace(tzinfo=utc_tz)
#                    debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
            except:
                try:
                    converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%d")
                    converted_date = converted_date.replace(tzinfo=utc_tz)
#                        debug_print("convert_kobo_date - '%Y-%m-%d' - kobo_date={0}'".format(kobo_date))
                except:
                    try:
                        from calibre.utils.date import parse_date
                        converted_date = parse_date(kobo_date, assume_utc=True)
#                            debug_print("convert_kobo_date - parse_date - kobo_date=%s' - kobo_date={0}'".format(kobo_date))
                    except:
#                        try:
#                            converted_date = time.gmtime(os.path.getctime(self.path))
#                            debug_print("convert_kobo_date - time.gmtime(os.path.getctime(self.path)) - kobo_date={0}'".format(kobo_date))
#                        except:
                        converted_date = time.gmtime()
                        debug_print("convert_kobo_date - time.gmtime() - kobo_date={0}'".format(kobo_date))
    return converted_date
Example #28
def c_parse(val):
    try:
        year, month, day, hour, minutes, seconds, tzsecs = _c_speedup(val)
    except (AttributeError, TypeError):
        # If a value like 2001 is stored in the column, apsw will return it as
        # an int
        if isinstance(val, (int, float)):
            return datetime(int(val), 1, 3, tzinfo=utc_tz)
    except:
        pass
    else:
        try:
            ans = datetime(year, month, day, hour, minutes, seconds, tzinfo=utc_tz)
            if tzsecs != 0:
                ans -= timedelta(seconds=tzsecs)
        except OverflowError:
            ans = UNDEFINED_DATE
        return ans
    try:
        return parse_date(val, as_utc=True, assume_utc=True)
    except ValueError:
        return UNDEFINED_DATE
Example #29
 def opts_to_mi(self, mi):
     from calibre.ebooks.metadata import string_to_authors
     for x in self.metadata_option_names:
         val = getattr(self.opts, x, None)
         if val is not None:
             if x == 'authors':
                 val = string_to_authors(val)
             elif x == 'tags':
                 val = [i.strip() for i in val.split(',')]
             elif x in ('rating', 'series_index'):
                 try:
                     val = float(val)
                 except ValueError:
                     self.log.warn(_('Values of series index and rating must'
                     ' be numbers. Ignoring'), val)
                     continue
             elif x in ('timestamp', 'pubdate'):
                 try:
                     val = parse_date(val, assume_utc=x=='pubdate')
                 except:
                     self.log.exception(_('Failed to parse date/time') + ' ' +
                             unicode(val))
                     continue
             setattr(mi, x, val)
Example #30
def string_to_datetime(src):
    from calibre.utils.date import parse_date
    if src == "None":
        return None
    return parse_date(src)
Example #31
    def parse(self, raw, desc_raw):
        from calibre.ebooks.metadata.book.base import Metadata
        from calibre.utils.date import parse_date, utcnow
        import json

        root = parse_html(raw.decode('gb18030'))
        title = root.xpath('//*[@id="name"]/div[1]/text()')
        title = title[0].strip()
        authors = []
        for i in root.xpath('//*[@id="p-author"]/a'):
            authors.append(i.text.strip())
        mi = Metadata(title, authors)

        information = root.xpath('//*[@id="parameter2"]/li')
        info = dict()
        for i in information:
            tmp = etree.tostring(i, method='text',
                                 encoding='utf-8').split(u':')
            info[tmp[0].strip()] = tmp[1].strip()
        # Identifiers
        mi.identifiers = self.plugin.identifiers
        mi.identifiers['jd'] = self.sku
        isbn = info['ISBN']
        self.log.error(isbn)
        if isbn:
            mi.isbn = isbn
            self.plugin.cache_isbn_to_identifier(isbn, self.sku)
            mi.identifiers['isbn'] = isbn

        # Publisher
        mi.publisher = info.get(u'出版社')

        # Pubdate
        pubdate = info.get(u'出版时间')
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                self.log.error('Failed to parse pubdate %r' % pubdate)

        # Series
        mi.series = info.get(u'丛书名')

        img = root.xpath('//*[@id="spec-n1"]/img')
        cover = img[0].get('src')
        if cover:
            if not cover.startswith('http'):
                cover = 'https:' + cover
            self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.error(cover)

        mi.has_cover = self.plugin.cached_identifier_to_cover_url(
            self.sku) is not None

        # Comments
        # showdesc({"date":1583588455348,"content":" ... "})
        try:
            desc = json.loads(desc_raw[9:-1].decode('gb18030'))
            desc_root = parse_html(desc['content'])
            div = desc_root.xpath(
                '//*[@id="detail-tag-id-3"]/div[2]/div/text()')

            comments = div[0]
            mi.comments = comments
        finally:
            return mi
Example #32
    def get_dates_matches(self, location, query, candidates):
        matches = set([])
        if len(query) < 2:
            return matches

        if location == 'date':
            location = 'timestamp'
        loc = self.field_metadata[location]['rec_index']

        if query == 'false':
            for id_ in candidates:
                item = self._data[id_]
                if item is None:
                    continue
                v = item[loc]
                if isinstance(v, (bytes, unicode_type)):
                    v = parse_date(v)
                if v is None or v <= UNDEFINED_DATE:
                    matches.add(item[0])
            return matches
        if query == 'true':
            for id_ in candidates:
                item = self._data[id_]
                if item is None:
                    continue
                v = item[loc]
                if isinstance(v, (bytes, unicode_type)):
                    v = parse_date(v)
                if v is not None and v > UNDEFINED_DATE:
                    matches.add(item[0])
            return matches

        relop = None
        for k in self.date_search_relops.keys():
            if query.startswith(k):
                (p, relop) = self.date_search_relops[k]
                query = query[p:]
        if relop is None:
            (p, relop) = self.date_search_relops['=']

        if query in self.local_today:
            qd = now()
            field_count = 3
        elif query in self.local_yesterday:
            qd = now() - timedelta(1)
            field_count = 3
        elif query in self.local_thismonth:
            qd = now()
            field_count = 2
        elif query.endswith(self.local_daysago) or query.endswith(
                self.untrans_daysago):
            num = query[0:-(
                self.local_daysago_len if query.
                endswith(self.local_daysago) else self.untrans_daysago_len)]
            try:
                qd = now() - timedelta(int(num))
            except:
                raise ParseException(
                    _('Number conversion error: {0}').format(num))
            field_count = 3
        else:
            try:
                qd = parse_date(query, as_utc=False)
            except:
                raise ParseException(
                    _('Date conversion error: {0}').format(query))
            if '-' in query:
                field_count = query.count('-') + 1
            else:
                field_count = query.count('/') + 1
        for id_ in candidates:
            item = self._data[id_]
            if item is None or item[loc] is None:
                continue
            v = item[loc]
            if isinstance(v, (bytes, unicode_type)):
                v = parse_date(v)
            if relop(v, qd, field_count):
                matches.add(item[0])
        return matches
Example #33
def adapt_datetime(x):
    if isinstance(x, (unicode, bytes)):
        x = parse_date(x, assume_utc=False, as_utc=False)
    if x and is_date_undefined(x):
        x = UNDEFINED_DATE
    return x
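The two keyword flags seen here and throughout these examples control time zone handling: assume_utc picks the zone a naive timestamp is assumed to be in, and as_utc picks the zone the returned aware datetime is converted to. A small hedged sketch of that behaviour, inferred from how the flags are named and used in these examples:

from calibre.utils.date import parse_date

raw = '2013-07-22 15:18:29'  # no explicit time zone in the string
local_dt = parse_date(raw, assume_utc=False, as_utc=False)  # treat as local time, return local time
utc_dt = parse_date(raw, assume_utc=True, as_utc=True)      # treat as UTC, return UTC
print(local_dt.tzinfo, utc_dt.tzinfo)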
Example #34
def metadata_from_filename(name, pat=None):
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_date
                mi.pubdate = parse_date(pubdate)
        except:
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #35
    def __init__(self,
                 prefix,
                 lpath,
                 title=None,
                 authors=None,
                 mime=None,
                 date=None,
                 ContentType=None,
                 thumbnail_name=None,
                 size=None,
                 other=None):
        from calibre.utils.date import parse_date
        #         debug_print('Book::__init__ - title=', title)
        show_debug = title is not None and title.lower().find("xxxxx") >= 0
        if other is not None:
            other.title = title
            other.published_date = date
        if show_debug:
            debug_print("Book::__init__ - title=", title, 'authors=', authors)
            debug_print("Book::__init__ - other=", other)
        super(Book, self).__init__(prefix, lpath, size, other)

        if title is not None and len(title) > 0:
            self.title = title

        if authors is not None and len(authors) > 0:
            self.authors_from_string(authors)
            if self.author_sort is None or self.author_sort == "Unknown":
                self.author_sort = author_to_author_sort(authors)

        self.mime = mime

        self.size = size  # will be set later if None

        if ContentType == '6' and date is not None:
            try:
                self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
            except:
                try:
                    self.datetime = time.strptime(
                        date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                except:
                    try:
                        self.datetime = time.strptime(
                            date.split('+')[0], "%Y-%m-%d")
                    except:
                        try:
                            self.datetime = parse_date(
                                date, assume_utc=True).timetuple()
                        except:
                            try:
                                self.datetime = time.gmtime(
                                    os.path.getctime(self.path))
                            except:
                                self.datetime = time.gmtime()

        self.kobo_metadata = Metadata(title, self.authors)
        self.contentID = None
        self.current_shelves = []
        self.kobo_collections = []
        self.can_put_on_shelves = True
        self.kobo_series = None
        self.kobo_series_number = None  # Kobo stores the series number as string. And it can have a leading "#".
        self.kobo_subtitle = None

        if thumbnail_name is not None:
            self.thumbnail = ImageWrapper(thumbnail_name)

        if show_debug:
            debug_print("Book::__init__ end - self=", self)
            debug_print("Book::__init__ end - title=", title, 'authors=',
                        authors)
Example #36
def safe_parse_date(raw):
    if raw:
        try:
            return parse_date(raw)
        except Exception:
            pass
Example #37
    def accept(self):
        col = unicode_type(self.column_name_box.text()).strip()
        if not col:
            return self.simple_error('', _('No lookup name was provided'))
        if col.startswith('#'):
            col = col[1:]
        if re.match(r'^\w*$',
                    col) is None or not col[0].isalpha() or col.lower() != col:
            return self.simple_error(
                '',
                _('The lookup name must contain only '
                  'lower case letters, digits and underscores, and start with a letter'
                  ))
        if col.endswith('_index'):
            return self.simple_error(
                '',
                _('Lookup names cannot end with _index, '
                  'because these names are reserved for the index of a series column.'
                  ))
        col_heading = unicode_type(self.column_heading_box.text()).strip()
        coldef = self.column_types[self.column_type_box.currentIndex()]
        col_type = coldef['datatype']
        if col_type[0] == '*':
            col_type = col_type[1:]
            is_multiple = True
        else:
            is_multiple = False
        if not col_heading:
            return self.simple_error('', _('No column heading was provided'))

        db = self.parent.gui.library_view.model().db
        key = db.field_metadata.custom_field_prefix + col
        bad_col = False
        if key in self.parent.custcols:
            if not self.editing_col or \
                    self.parent.custcols[key]['colnum'] != self.orig_column_number:
                bad_col = True
        if bad_col:
            return self.simple_error(
                '',
                _('The lookup name %s is already used') % col)

        bad_head = False
        for t in self.parent.custcols:
            if self.parent.custcols[t]['name'] == col_heading:
                if not self.editing_col or \
                        self.parent.custcols[t]['colnum'] != self.orig_column_number:
                    bad_head = True
        for t in self.standard_colheads:
            if self.standard_colheads[t] == col_heading:
                bad_head = True
        if bad_head:
            return self.simple_error(
                '',
                _('The heading %s is already used') % col_heading)

        display_dict = {}

        default_val = (unicode_type(self.default_value.text()).strip()
                       if col_type != 'composite' else None)

        if col_type == 'datetime':
            if unicode_type(self.format_box.text()).strip():
                display_dict = {
                    'date_format':
                    unicode_type(self.format_box.text()).strip()
                }
            else:
                display_dict = {'date_format': None}
            if default_val:
                if default_val == _('Now'):
                    display_dict['default_value'] = 'now'
                else:
                    try:
                        tv = parse_date(default_val)
                    except:
                        tv = UNDEFINED_DATE
                    if tv == UNDEFINED_DATE:
                        return self.simple_error(
                            _('Invalid default value'),
                            _('The default value must be "Now" or a date'))
                    display_dict['default_value'] = default_val
        elif col_type == 'composite':
            if not unicode_type(self.composite_box.text()).strip():
                return self.simple_error(
                    '', _('You must enter a template for '
                          'composite columns'))
            display_dict = {
                'composite_template':
                unicode_type(self.composite_box.text()).strip(),
                'composite_sort':
                ['text', 'number', 'date',
                 'bool'][self.composite_sort_by.currentIndex()],
                'make_category':
                self.composite_make_category.isChecked(),
                'contains_html':
                self.composite_contains_html.isChecked(),
            }
        elif col_type == 'enumeration':
            if not unicode_type(self.enum_box.text()).strip():
                return self.simple_error(
                    '',
                    _('You must enter at least one '
                      'value for enumeration columns'))
            l = [
                v.strip()
                for v in unicode_type(self.enum_box.text()).split(',')
                if v.strip()
            ]
            l_lower = [v.lower() for v in l]
            for i, v in enumerate(l_lower):
                if v in l_lower[i + 1:]:
                    return self.simple_error(
                        '',
                        _('The value "{0}" is in the '
                          'list more than once, perhaps with different case').
                        format(l[i]))
            c = unicode_type(self.enum_colors.text())
            if c:
                c = [
                    v.strip()
                    for v in unicode_type(self.enum_colors.text()).split(',')
                ]
            else:
                c = []
            if len(c) != 0 and len(c) != len(l):
                return self.simple_error(
                    '',
                    _('The colors box must be empty or '
                      'contain the same number of items as the value box'))
            for tc in c:
                if tc not in QColor.colorNames() and not re.match(
                        "#(?:[0-9a-f]{3}){1,4}", tc, re.I):
                    return self.simple_error(
                        '',
                        _('The color {0} is unknown').format(tc))
            display_dict = {'enum_values': l, 'enum_colors': c}
            if default_val:
                if default_val not in l:
                    return self.simple_error(
                        _('Invalid default value'),
                        _('The default value must be one of the permitted values'
                          ))
                display_dict['default_value'] = default_val
        elif col_type == 'text' and is_multiple:
            display_dict = {'is_names': self.is_names.isChecked()}
        elif col_type in ['int', 'float']:
            if unicode_type(self.format_box.text()).strip():
                display_dict = {
                    'number_format':
                    unicode_type(self.format_box.text()).strip()
                }
            else:
                display_dict = {'number_format': None}
            if default_val:
                try:
                    if col_type == 'int':
                        msg = _('The default value must be an integer')
                        tv = int(default_val)
                        display_dict['default_value'] = tv
                    else:
                        msg = _('The default value must be a real number')
                        tv = float(default_val)
                        display_dict['default_value'] = tv
                except:
                    return self.simple_error(_('Invalid default value'), msg)
        elif col_type == 'comments':
            display_dict['heading_position'] = unicode_type(
                self.comments_heading_position.currentData())
            display_dict['interpret_as'] = unicode_type(
                self.comments_type.currentData())
        elif col_type == 'rating':
            half_stars = bool(self.allow_half_stars.isChecked())
            display_dict['allow_half_stars'] = half_stars
            if default_val:
                try:
                    tv = int((float(default_val)
                              if half_stars else int(default_val)) * 2)
                except:
                    tv = -1
                if tv < 0 or tv > 10:
                    if half_stars:
                        return self.simple_error(
                            _('Invalid default value'),
                            _('The default value must be a real number between 0 and 5.0'
                              ))
                    else:
                        return self.simple_error(
                            _('Invalid default value'),
                            _('The default value must be an integer between 0 and 5'
                              ))
                display_dict['default_value'] = tv
        elif col_type == 'bool':
            if default_val:
                tv = {_('Yes'): True, _('No'): False}.get(default_val, None)
                if tv is None:
                    return self.simple_error(
                        _('Invalid default value'),
                        _('The default value must be "Yes" or "No"'))
                display_dict['default_value'] = tv

        if col_type in ['text', 'composite', 'enumeration'
                        ] and not is_multiple:
            display_dict['use_decorations'] = self.use_decorations.checkState()

        if default_val and 'default_value' not in display_dict:
            display_dict['default_value'] = default_val

        display_dict['description'] = self.description_box.text().strip()

        if not self.editing_col:
            self.parent.custcols[key] = {
                'label': col,
                'name': col_heading,
                'datatype': col_type,
                'display': display_dict,
                'normalized': None,
                'colnum': None,
                'is_multiple': is_multiple,
            }
            self.parent.cc_column_key = key
        else:
            self.parent.custcols[self.orig_column_name]['label'] = col
            self.parent.custcols[self.orig_column_name]['name'] = col_heading
            # Remove any previous default value
            self.parent.custcols[self.orig_column_name]['display'].pop(
                'default_value', None)
            self.parent.custcols[self.orig_column_name]['display'].update(
                display_dict)
            self.parent.custcols[self.orig_column_name]['*edited'] = True
            self.parent.custcols[self.orig_column_name]['*must_restart'] = True
            self.parent.cc_column_key = key
        QDialog.accept(self)
Beispiel #38
0
def get_metadata_(src, encoding=None):
    # Metadata definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages', ):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields
    for field in ('comments', ):
        val = get(field)
        if val:
            setattr(
                mi, field,
                val.replace('&', '&amp;').replace('<', '&lt;').replace(
                    '>', '&gt;').replace('"', '&quot;').replace("'", '&apos;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
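        # e.g. "Foundation [1.5]" yields series "Foundation" and series_index 1.5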
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi
Beispiel #39
0
def string_to_datetime(src):
    if src == "None":
        return None
    return parse_date(src)
Beispiel #40
0
 def date_sort_key(self, val):
     try:
         val = self._filter_date(parse_date(val))
     except (TypeError, ValueError, AttributeError, KeyError):
         val = UNDEFINED_DATE
     return val
Beispiel #41
0
    def __call__(self, query, field_iter):
        matches = set()
        if len(query) < 2:
            return matches

        if query == 'false':
            for v, book_ids in field_iter():
                if isinstance(v, (str, unicode)):
                    v = parse_date(v)
                if v is None or v <= UNDEFINED_DATE:
                    matches |= book_ids
            return matches

        if query == 'true':
            for v, book_ids in field_iter():
                if isinstance(v, (str, unicode)):
                    v = parse_date(v)
                if v is not None and v > UNDEFINED_DATE:
                    matches |= book_ids
            return matches

        relop = None
        for k, op in self.operators.iteritems():
            if query.startswith(k):
                p, relop = op
                query = query[p:]
        if relop is None:
            relop = self.operators['='][-1]

        if query in self.local_today:
            qd = now()
            field_count = 3
        elif query in self.local_yesterday:
            qd = now() - timedelta(1)
            field_count = 3
        elif query in self.local_thismonth:
            qd = now()
            field_count = 2
        else:
            m = self.daysago_pat.search(query)
            if m is not None:
                num = query[:-len(m.group(1))]
                try:
                    qd = now() - timedelta(int(num))
                except:
                    raise ParseException(
                        _('Number conversion error: {0}').format(num))
                field_count = 3
            else:
                try:
                    qd = parse_date(query, as_utc=False)
                except:
                    raise ParseException(
                        _('Date conversion error: {0}').format(query))
                if '-' in query:
                    field_count = query.count('-') + 1
                else:
                    field_count = query.count('/') + 1
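        # field_count is how many date components the comparison uses: e.g. "2014"
        # compares the year only, "2014-03" year and month, "2014-03-01" the full date.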

        for v, book_ids in field_iter():
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if v is not None and relop(dt_as_local(v), qd, field_count):
                matches |= book_ids

        return matches
Beispiel #42
0
    def parse_exported_highlights(self, raw, log_failure=True):
        """
        Extract highlights from a pasted Annotations summary and add them to the
        selected book in the calibre library.

        Construct a BookStruct object with the book's metadata.
        Starred items are the minimum required.
           BookStruct properties:
            *active: [True|False]
            *author: "John Smith"
             author_sort: (if known)
            *book_id: an int uniquely identifying the book.
                     Highlights are associated with books through book_id
             genre: "Fiction" (if known)
            *title: "The Story of John Smith"
             title_sort: "Story of John Smith, The" (if known)
             uuid: Calibre's uuid for this book, if known

        Construct an AnnotationStruct object with the highlight's metadata.
        Starred items are the minimum required. Of the dashed items
        (highlight_text and note_text), one or both may be present.
          AnnotationStruct properties:
            annotation_id: an int uniquely identifying the annotation
           *book_id: The book this annotation is associated with
            highlight_color: [Blue|Gray|Green|Pink|Purple|Underline|Yellow]
           -highlight_text: A list of paragraphs constituting the highlight
            last_modification: The timestamp of the annotation
            location: location of highlight in the book
           -note_text: A list of paragraphs constituting the note
           *timestamp: Unique timestamp of highlight's creation/modification time

        """
        # Create the annotations, books table as needed
        self.annotations_db = "%s_imported_annotations" % self.app_name_
        self.create_annotations_table(self.annotations_db)
        self.books_db = "%s_imported_books" % self.app_name_
        self.create_books_table(self.books_db)

        self.annotated_book_list = []
        self.selected_books = None

        # Generate the book metadata from the selected book
        row = self.opts.gui.library_view.currentIndex()
        book_id = self.opts.gui.library_view.model().id(row)
        db = self.opts.gui.current_db
        mi = db.get_metadata(book_id, index_is_id=True)

        try:
            lines = raw.split('\n')
            if len(lines) < 5:
                raise AnnotationsException("Invalid annotations summary")
            index = 0
            annotations = {}

            # Get the title, author, publisher from the first three lines
            title = lines[index]
            index += 1
            author = lines[index]
            index += 1
            publisher = lines[index]
            index += 1

            # Next line should be the first timestamp/location
            while index < len(lines):
                tsl = re.match(r'^(?P<timestamp>.*) \((?P<location>Page .*)\)', lines[index])
                if tsl:
                    ts = tsl.group('timestamp')
                    isoformat = parse_date(ts, as_utc=False)
                    isoformat = isoformat.replace(hour=12)
                    timestamp = mktime(isoformat.timetuple())
                    while timestamp in annotations:
                        timestamp += 60

                    location = tsl.group('location')
                    index += 1

                    # Continue with highlight
                    highlight_text = lines[index]
                    index += 1

                    # Next line is either Note: or a new tsl
                    note = re.match(r'^Notes: (?P<note_text>.*)', lines[index])
                    note_text = None
                    if note:
                        note_text = note.group('note_text')
                        index += 1

                    if re.match(r'^(?P<timestamp>.*) \((?P<location>Page .*)\)', lines[index]):
                        # New note - store the old one, continue
                        ann = AnnotationStruct()
                        ann.book_id = mi.id
                        ann.annotation_id = index
                        ann.highlight_color = 'Yellow'
                        ann.highlight_text = highlight_text
                        ann.location = location
                        ann.location_sort = "%05d" % int(re.match(r'^Page (?P<page>\d+).*$', location).group('page'))
                        ann.note_text = note_text
                        ann.last_modification = timestamp

                        # Add annotation to db
                        annotations[timestamp] = ann
                        continue
                else:
                    # Store the last one
                    ann = AnnotationStruct()
                    ann.book_id = mi.id
                    ann.annotation_id = index
                    ann.highlight_color = 'Yellow'
                    ann.highlight_text = highlight_text
                    ann.location = location
                    ann.location_sort = "%05d" % int(re.match(r'^Page (?P<page>\d+).*$', location).group('page'))
                    ann.note_text = note_text
                    ann.last_modification = timestamp
                    annotations[timestamp] = ann
                    break
        except:
            if log_failure:
                self._log(" unable to parse %s Annotations" % self.app_name)
                self._log("{:~^80}".format(" Imported Annotation summary "))
                self._log(raw)
                self._log("{:~^80}".format(" end imported Annotations summary "))
                import traceback
                traceback.print_exc()
                msg = ('Unable to parse Annotation summary from %s. ' % self.app_name +
                    'Paste entire contents of emailed summary.')
                MessageBox(MessageBox.WARNING,
                    'Error importing annotations',
                    msg,
                    show_copy_button=False,
                    parent=self.opts.gui).exec_()
                self._log_location("WARNING: %s" % msg)
            return False

        # Populate a BookStruct
        book_mi = BookStruct()
        book_mi.active = True
        book_mi.author = author
        book_mi.book_id = mi.id
        book_mi.title = title
        book_mi.uuid = None
        book_mi.last_update = time.mktime(time.localtime())
        book_mi.reader_app = self.app_name
        book_mi.cid = mi.id
        book_mi.annotations = len(annotations)

        # Add book to books_db
        self.add_to_books_db(self.books_db, book_mi)
        self.annotated_book_list.append(book_mi)

        # Add the annotations
        for timestamp in sorted(annotations.keys()):
            self.add_to_annotations_db(self.annotations_db, annotations[timestamp])
            self.update_book_last_annotation(self.books_db, timestamp, mi.id)
            self.opts.pb.increment()

        # Update the timestamp
        self.update_timestamp(self.annotations_db)
        self.update_timestamp(self.books_db)
        self.commit()

        # Return True if successful
        return True
Beispiel #43
0
    def get_dates_matches(self, location, query, candidates):
        matches = set([])
        if len(query) < 2:
            return matches

        if location == 'date':
            location = 'timestamp'
        loc = self.field_metadata[location]['rec_index']

        if query == 'false':
            for id_ in candidates:
                item = self._data[id_]
                if item is None:
                    continue
                v = item[loc]
                if isinstance(v, (str, unicode)):
                    v = parse_date(v)
                if v is None or v <= UNDEFINED_DATE:
                    matches.add(item[0])
            return matches
        if query == 'true':
            for id_ in candidates:
                item = self._data[id_]
                if item is None:
                    continue
                v = item[loc]
                if isinstance(v, (str, unicode)):
                    v = parse_date(v)
                if v is not None and v > UNDEFINED_DATE:
                    matches.add(item[0])
            return matches

        relop = None
        for k in self.date_search_relops.keys():
            if query.startswith(k):
                (p, relop) = self.date_search_relops[k]
                query = query[p:]
        if relop is None:
            (p, relop) = self.date_search_relops['=']

        if query in self.local_today:
            qd = now()
            field_count = 3
        elif query in self.local_yesterday:
            qd = now() - timedelta(1)
            field_count = 3
        elif query in self.local_thismonth:
            qd = now()
            field_count = 2
        elif query.endswith(self.local_daysago) or query.endswith(self.untrans_daysago):
            num = query[0:-(self.local_daysago_len if query.endswith(self.local_daysago) else self.untrans_daysago_len)]
            try:
                qd = now() - timedelta(int(num))
            except:
                raise ParseException(_('Number conversion error: {0}').format(num))
            field_count = 3
        else:
            try:
                qd = parse_date(query, as_utc=False)
            except:
                raise ParseException(_('Date conversion error: {0}').format(query))
            if '-' in query:
                field_count = query.count('-') + 1
            else:
                field_count = query.count('/') + 1
        for id_ in candidates:
            item = self._data[id_]
            if item is None or item[loc] is None:
                continue
            v = item[loc]
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if relop(v, qd, field_count):
                matches.add(item[0])
        return matches
Beispiel #44
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError(
                'Corrupted XMP metadata packet detected, probably generated by Nitro PDF'
            )
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root)
                             or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple(f'//{namespace}:{scheme}', root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in iteritems({
            'doi': check_doi,
            'isbn': check_isbn
    }):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #45
0
def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, True)
    parser.setFeature(xml.sax.handler.feature_external_ges, False)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata','') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data

    return mi
Beispiel #46
0
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences(
        '//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root)
                             or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except:
            pass
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass

    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = filter(None, map(canonicalize_lang, languages))
        if languages:
            mi.languages = languages

    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value

    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {
            'doi': check_doi,
            'isbn': check_isbn
    }.iteritems():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                identifiers[scheme] = val

    if identifiers:
        mi.set_identifiers(identifiers)

    read_user_metadata(mi, root)

    return mi
Beispiel #47
0
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         # Author names in Amazon MOBI files are usually in LN, FN format,
         # try to detect and auto-correct that.
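         # e.g. "Tolkien, J. R. R." typically becomes "J. R. R. Tolkien" (unless the
         # author_sort_copy_method tweak is 'copy'), with the original "LN, FN" form
         # kept as author_sort.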
         m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
         if m is not None:
             if tweaks['author_sort_copy_method'] != 'copy':
                 self.mi.authors.append(m.group(2) + ' ' + m.group(1))
             else:
                 self.mi.authors.append(m.group())
             if self.mi.is_null('author_sort'):
                 self.mi.author_sort = m.group()
         else:
             self.mi.authors.append(au)
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments  = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
     elif idx == 109:
         self.mi.rights = clean_xml_chars(self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Beispiel #48
0
def sony_metadata(oeb):
    m = oeb.metadata
    title = short_title = str(m.title[0])
    publisher = __appname__ + ' ' + __version__
    try:
        pt = str(oeb.metadata.publication_type[0])
        short_title = ':'.join(pt.split(':')[2:])
    except:
        pass

    try:
        date = parse_date(str(m.date[0]),
                as_utc=False).strftime('%Y-%m-%d')
    except:
        date = strftime('%Y-%m-%d')
    try:
        language = str(m.language[0]).replace('_', '-')
    except:
        language = 'en'
    short_title = xml(short_title, True)

    metadata = SONY_METADATA.format(title=xml(title),
            short_title=short_title,
            publisher=xml(publisher), issue_date=xml(date),
            language=xml(language))

    updated = strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    def cal_id(x):
        for k, v in x.attrib.items():
            if k.endswith('scheme') and v == 'uuid':
                return True

    try:
        base_id = str(list(filter(cal_id, m.identifier))[0])
    except:
        base_id = str(uuid4())

    toc = oeb.toc

    if False and toc.depth() < 3:
        # Single section periodical
        # Disabled since I prefer the current behavior
        from calibre.ebooks.oeb.base import TOC
        section = TOC(klass='section', title=_('All articles'),
                    href=oeb.spine[2].href)
        for x in toc:
            section.nodes.append(x)
        toc = TOC(klass='periodical', href=oeb.spine[2].href,
                    title=str(oeb.metadata.title[0]))
        toc.nodes.append(section)

    entries = []
    seen_titles = set()
    for i, section in enumerate(toc):
        if not section.href:
            continue
        secid = 'section%d'%i
        sectitle = section.title
        if not sectitle:
            sectitle = _('Unknown')
        d = 1
        bsectitle = sectitle
        while sectitle in seen_titles:
            sectitle = bsectitle + ' ' + str(d)
            d += 1
        seen_titles.add(sectitle)
        sectitle = xml(sectitle, True)
        secdesc = section.description
        if not secdesc:
            secdesc = ''
        secdesc = xml(secdesc)
        entries.append(SONY_ATOM_SECTION.format(title=sectitle,
            href=section.href, id=xml(base_id)+'/'+secid,
            short_title=short_title, desc=secdesc, updated=updated))

        for j, article in enumerate(section):
            if not article.href:
                continue
            atitle = article.title
            btitle = atitle
            d = 1
            while atitle in seen_titles:
                atitle = btitle + ' ' + str(d)
                d += 1

            auth = article.author if article.author else ''
            desc = section.description
            if not desc:
                desc = ''
            aid = 'article%d'%j

            entries.append(SONY_ATOM_ENTRY.format(
                title=xml(atitle),
                author=xml(auth),
                updated=updated,
                desc=desc,
                short_title=short_title,
                section_title=sectitle,
                href=article.href,
                word_count=str(1),
                id=xml(base_id)+'/'+secid+'/'+aid
            ))

    atom = SONY_ATOM.format(short_title=short_title,
            entries='\n\n'.join(entries), updated=updated,
            id=xml(base_id)).encode('utf-8')

    return metadata, atom
Beispiel #49
0
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(
                    tostring(ans[0],
                             method='text',
                             encoding='unicode',
                             with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined',
                              namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val
        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
Beispiel #50
0
 def process_metadata(self, idx, content, codec):
     if idx == 100:
         if self.mi.is_null('authors'):
             self.mi.authors = []
         au = clean_xml_chars(self.decode(content).strip())
         self.mi.authors.append(au)
         if self.mi.is_null('author_sort') and re.match(
                 r'\S+?\s*,\s+\S+', au.strip()):
             self.mi.author_sort = au.strip()
     elif idx == 101:
         self.mi.publisher = clean_xml_chars(self.decode(content).strip())
         if self.mi.publisher in {'Unknown', _('Unknown')}:
             self.mi.publisher = None
     elif idx == 103:
         self.mi.comments = clean_xml_chars(self.decode(content).strip())
     elif idx == 104:
         raw = check_isbn(self.decode(content).strip().replace('-', ''))
         if raw:
             self.mi.isbn = raw
     elif idx == 105:
         if not self.mi.tags:
             self.mi.tags = []
         self.mi.tags.extend([
             x.strip()
             for x in clean_xml_chars(self.decode(content)).split(';')
         ])
         self.mi.tags = list(set(self.mi.tags))
     elif idx == 106:
         try:
             self.mi.pubdate = parse_date(content, as_utc=False)
         except:
             pass
     elif idx == 108:
         self.mi.book_producer = clean_xml_chars(
             self.decode(content).strip())
     elif idx == 112:  # dc:source set in some EBSP amazon samples
         try:
             content = content.decode(codec).strip()
             isig = 'urn:isbn:'
             if content.lower().startswith(isig):
                 raw = check_isbn(content[len(isig):])
                 if raw and not self.mi.isbn:
                     self.mi.isbn = raw
             elif content.startswith('calibre:'):
                 # calibre book uuid is stored here by recent calibre
                 # releases
                 cid = content[len('calibre:'):]
                 if cid:
                     self.mi.application_id = self.mi.uuid = cid
         except:
             pass
     elif idx == 113:  # ASIN or other id
         try:
             self.uuid = content.decode('ascii')
             self.mi.set_identifier('mobi-asin', self.uuid)
         except:
             self.uuid = None
     elif idx == 116:
         self.start_offset, = struct.unpack(b'>L', content)
     elif idx == 121:
         self.kf8_header, = struct.unpack(b'>L', content)
         if self.kf8_header == NULL_INDEX:
             self.kf8_header = None
Beispiel #51
0
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # total_results  = XPath('//openSearch:totalResults')
    # start_index    = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw),
                           strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
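    # Only values prefixed with "ISBN:" are collected; "LCCN:" and "OCLC:" values
    # match the outer check below but are otherwise ignored.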
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath(
            '//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'
    ):
        mi.has_google_cover = x.get('href')
        break

    return mi
Beispiel #52
0
def get_metadata_(src, encoding=None):
    # Metadata definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9

    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
Beispiel #53
0
    def _parse_my_clippings_original(self):
        '''
        Parse MyClippings.txt for entries matching installed books.
        File should end with SEPARATOR and a newline.
        '''
        SEPARATOR = '=========='
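        # Illustrative sketch of one clippings entry (assumed; inferred from the
        # parsing below):
        #   <Title> (<Author Sort>)
        #   - Your Highlight on Location 1012-1013 | Added on <timestamp>
        #
        #   <highlight or note text>
        #   ==========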
        cp = self._get_my_clippings()
        timestamp_parse_failed = False
        if cp:
            lines = []
            # Apparently new MyClippings.txt files are encoded UTF-8 with BOM
            with open(cp) as clippings:
                for line in clippings:
                    stripped = line.decode('utf-8-sig')
                    lines.append(stripped)

            index = 0
            line = lines[index]
            # Get to the first title (author_sort) line
            while not re.match(r'(?P<title>.*)\((?P<author_sort>.*)\)', lines[index]):
                index += 1

            while index < len(lines) - 1:
                try:
                    line = lines[index]
                    book_id = None

                    # 1. Get the title/author_sort pair
                    tas = re.match(r'(?P<title>.*)\((?P<author_sort>.*)\)', line)
                    title = tas.group('title').rstrip()
                    author_sort = tas.group('author_sort')
                    # If title/author_sort match book in library,
                    # consider this an active annotation
                    if title in self.installed_books_by_title.keys():
                        book_id = self.installed_books_by_title[title]['book_id']
                    index += 1

                    # 2. Get [Highlight|Bookmark Location|Note]
                    line = lines[index]
                    ann_type = None
                    if 'Highlight' in line:
                        ann_type = 'Highlight'
                    elif 'Bookmark' in line:
                        ann_type = 'Bookmark'
                    elif 'Note' in line:
                        ann_type = 'Note'
                    # Kindle PW uses 'Location', K3 uses 'Loc.'. German uses 'Position'
                    # K3 does not store location with Bookmarks. Whatever.
                    loc = re.match(r'.* (?P<location>(Location|Loc\.|Position) [0-9,-]+).*', line)
                    location = 'Unknown'
                    location_sort = "000000"
                    if loc:
                        location = loc.group('location')
                        location_sort = "%06d" % int(re.match(r'^(Loc\.|Location|Position) (?P<loc>[0-9]+).*$', location).group('loc'))

                    # Try to read the timestamp, fallback to local time
                    try:
                        tstring = re.match(r'.*Added on (?P<timestamp>.*$)', line)
                        ts = tstring.group('timestamp')
                        isoformat = parse_date(ts, as_utc=False)
                        timestamp = mktime(isoformat.timetuple())
                    except:
                        if not timestamp_parse_failed:
                            self._log(" Unable to parse entries from 'My Clippings.txt'")
                            self._log(" %s driver supports English only." % self.app_name)
                            timestamp_parse_failed = True
                        timestamp = mktime(localtime())
                        while timestamp in self.active_annotations:
                            timestamp += 1
                    index += 1

                    # 3. blank line(s)
                    while lines[index].strip() == '':
                        index += 1

                    # 4. highlight or note
                    item = lines[index]
                    highlight_text = None
                    note_text = None
                    if ann_type == 'Highlight':
                        highlight_text = [unicode(item)]
                        index += 1
                        while lines[index].strip() != SEPARATOR:
                            highlight_text.append(unicode(lines[index]))
                            index += 1
                    elif ann_type == 'Note':
                        note_text = [unicode(item)]
                        index += 1
                        while lines[index].strip() != SEPARATOR:
                            note_text.append(unicode(lines[index]))
                            index += 1
                    # Pass SEPARATOR
                    index += 1

                    # 5. Store the active_annotation
                    if book_id:
                        # Notes and highlights are created simultaneously
                        if timestamp not in self.active_annotations:
                            self.active_annotations[timestamp] = {
                                'annotation_id': timestamp,
                                'book_id': book_id,
                                'highlight_color': 'Gray',
                                'location': location,
                                'location_sort': location_sort
                                }
                        if highlight_text is not None:
                            self.active_annotations[timestamp]['highlight_text'] = highlight_text
                        if note_text is not None:
                            self.active_annotations[timestamp]['note_text'] = note_text
                except:
                    # Unexpected EOF. Return with whatever we have
                    self._log_location("failed with line: %s" % repr(line))
                    import traceback
                    traceback.print_exc()
                    return
Beispiel #54
0
 def adapt_datetime(x, d):
     if isinstance(x, (unicode_type, bytes)):
         x = parse_date(x, assume_utc=False, as_utc=False)
     return x
Beispiel #55
0
    def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
        from html5_parser import parse
        from lxml import html
        from calibre.ebooks.chardet import xml_to_unicode
        from calibre.library.comments import sanitize_comments_html

        try:
            raw = br.open_novisit(metadata_url).read()
        except Exception as e:
            if callable(getattr(e, 'getcode', None)) and \
                    e.getcode() == 404:
                return False
            raise
        raw = xml_to_unicode(raw,
                             strip_encoding_pats=True,
                             resolve_entities=True)[0]

        try:
            root = parse(raw, maybe_xhtml=False, sanitize_names=True)
        except Exception:
            return False

        pub_date = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath(
            "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
        )

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {
                'english': 'eng',
                'french': 'fra',
                'german': 'deu',
                'spanish': 'spa'
            }.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print("ebook isbn is "+type('')(ebook_isbn[0]))
            isbn = check_isbn(ebook_isbn[0].strip())
            if isbn:
                self.cache_isbn_to_identifier(isbn, ovrdrv_id)
                mi.isbn = isbn
        if subjects:
            mi.tags = [tag.strip() for tag in subjects[0].split(',')]

        if desc:
            desc = desc[0]
            desc = html.tostring(desc, method='html',
                                 encoding='unicode').strip()
            # remove all attributes from tags
            desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
            # Remove comments
            desc = re.sub(r'(?s)<!--.*?-->', '', desc)
            mi.comments = sanitize_comments_html(desc)

        return None
Beispiel #56
0
    def itervals(self, record):
        for name, fm in self.entries:
            dt = fm['datatype']
            val = record[fm['rec_index']]
            if dt == 'composite':
                sb = fm['display'].get('composite_sort', 'text')
                if sb == 'date':
                    try:
                        val = parse_date(val)
                    except:
                        val = UNDEFINED_DATE
                    dt = 'datetime'
                elif sb == 'number':
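                    # e.g. a composite value like "1.5 MB" is converted to
                    # 1.5 * 1024**2 before numeric sorting.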
                    try:
                        p = 1
                        for i, candidate in enumerate(
                            ('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')):
                            if val.endswith(candidate):
                                p = 1024**(i)
                                val = val[:-len(candidate)].strip()
                                break
                        val = locale.atof(val) * p
                    except:
                        val = 0.0
                    dt = 'float'
                elif sb == 'bool':
                    val = force_to_bool(val)
                    dt = 'bool'

            if dt == 'datetime':
                if val is None:
                    val = UNDEFINED_DATE
                if tweaks['sort_dates_using_visible_fields']:
                    format = None
                    if name == 'timestamp':
                        format = tweaks['gui_timestamp_display_format']
                    elif name == 'pubdate':
                        format = tweaks['gui_pubdate_display_format']
                    elif name == 'last_modified':
                        format = tweaks['gui_last_modified_display_format']
                    elif fm['is_custom']:
                        format = fm['display'].get('date_format', None)
                    val = clean_date_for_sort(val, format)
            elif dt == 'series':
                if val is None:
                    val = ('', 1)
                else:
                    if self.library_order:
                        try:
                            lang = record[self.lang_idx].partition(u',')[0]
                        except (AttributeError, ValueError, KeyError,
                                IndexError, TypeError):
                            lang = None
                        val = title_sort(val, order='library_order', lang=lang)
                    sidx_fm = self.field_metadata[name + '_index']
                    sidx = record[sidx_fm['rec_index']]
                    val = (self.string_sort_key(val), sidx)

            elif dt in ('text', 'comments', 'composite', 'enumeration'):
                if val:
                    if fm['is_multiple']:
                        jv = fm['is_multiple']['list_to_ui']
                        sv = fm['is_multiple']['cache_to_list']
                        if '&' in jv:
                            val = jv.join([
                                author_to_author_sort(v) for v in val.split(sv)
                            ])
                        else:
                            val = jv.join(
                                sorted(val.split(sv),
                                       key=self.string_sort_key))
                val = self.string_sort_key(val)

            elif dt == 'bool':
                if not self.db_prefs.get('bools_are_tristate'):
                    val = {True: 1, False: 2, None: 2}.get(val, 2)
                else:
                    val = {True: 1, False: 2, None: 3}.get(val, 3)

            yield val
Beispiel #57
0
    def to_metadata(self, browser, log, entry_, timeout):  # {{{
        from calibre.utils.date import parse_date, utcnow

        douban_id = entry_.get("id")
        title = entry_.get("title")
        description = entry_.get("summary")
        # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
        publisher = entry_.get("publisher")
        isbn = entry_.get("isbn13")  # ISBN11 is obsolute, use ISBN13
        pubdate = entry_.get("pubdate")
        authors = entry_.get("author")
        book_tags = entry_.get("tags")
        rating = entry_.get("rating")
        cover_url = entry_.get("images", {}).get("large")
        series = entry_.get("series")

        if not authors:
            authors = [_("Unknown")]
        if not douban_id or not title:
            # Silently discard this entry
            return None

        mi = Metadata(title, authors)
        mi.identifiers = {"douban": douban_id}
        mi.publisher = publisher
        mi.comments = description
        # mi.subtitle = subtitle

        # ISBN
        isbns = []
        if isinstance(isbn, (type(""), bytes)):
            if check_isbn(isbn):
                isbns.append(isbn)
        else:
            for x in isbn:
                if check_isbn(x):
                    isbns.append(x)
        if isbns:
            mi.isbn = sorted(isbns, key=len)[-1]
        mi.all_isbns = isbns

        # Tags
        if book_tags:
            mi.tags = [tag["name"] for tag in book_tags]

        # pubdate
        if pubdate:
            try:
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
            except:
                log.error("Failed to parse pubdate %r" % pubdate)

        # Ratings
        if rating:
            try:
                mi.rating = float(rating["average"]) / 2.0
            except:
                log.exception("Failed to parse rating")
                mi.rating = 0

        # Cover
        mi.has_douban_cover = None
        u = cover_url
        if u:
            # If URL contains "book-default", the book doesn't have a cover
            if u.find("book-default") == -1:
                mi.has_douban_cover = u

        # Series
        if series:
            mi.series = series["title"]

        return mi
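The pubdate handling in Beispiel #57 (and again in #59 below) supplies a default of the 15th of the current month, so that partial dates such as '2011-03' do not silently snap to the first of the month. A minimal sketch of that idiom follows; parse_pubdate is my own helper name, and it assumes calibre's parse_date fills missing components from the supplied default.

from calibre.utils.date import parse_date, utcnow

def parse_pubdate(raw, log=None):
    # Mirror the snippets above: assume UTC and fall back to mid-month for
    # missing day/month components; return None when parsing fails outright.
    default = utcnow().replace(day=15)
    try:
        return parse_date(raw, assume_utc=True, default=default)
    except Exception:
        if log is not None:
            log.error('Failed to parse pubdate %r' % raw)
        return None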
Beispiel #58
0
        pub_date = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
        lang = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
        subjects = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
        ebook_isbn = root.xpath(
            "//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
        desc = root.xpath(
            "//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]"
        )

        if pub_date:
            from calibre.utils.date import parse_date
            try:
                mi.pubdate = parse_date(pub_date[0].strip())
            except:
                pass
        if lang:
            lang = lang[0].strip().lower()
            lang = {
                'english': 'eng',
                'french': 'fra',
                'german': 'deu',
                'spanish': 'spa'
            }.get(lang, None)
            if lang:
                mi.language = lang

        if ebook_isbn:
            # print "ebook isbn is "+str(ebook_isbn[0])
Beispiel #59
0
def to_metadata(browser, log, entry_, timeout): # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry          = XPath('//atom:entry')
    entry_id       = XPath('descendant::atom:id')
    title          = XPath('descendant::atom:title')
    description    = XPath('descendant::atom:summary')
    publisher      = XPath("descendant::db:attribute[@name='publisher']")
    isbn           = XPath("descendant::db:attribute[@name='isbn13']")
    date           = XPath("descendant::db:attribute[@name='pubdate']")
    creator        = XPath("descendant::db:attribute[@name='author']")
    booktag        = XPath("descendant::db:tag/attribute::name")
    rating         = XPath("descendant::gd:rating/attribute::average")
    cover_url      = XPath("descendant::atom:link[@rel='image']/attribute::href")


    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban':douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(xml_to_unicode(clean_ascii_chars(raw),
            strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except:
        log.exception('Failed to get additional details for', mi.title)
        return mi
    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            log.error('Failed to parse pubdate %r'%pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u
    return mi
Beispiel #60
0
    def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):

        detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
        log.info(detail_url)

        try:
            br = self.browser
            _raw = br.open_novisit(detail_url, timeout=timeout)
            raw = _raw.read()
        except Exception as e:
            log.exception('Failed to load detail page: %s' % detail_url)
            return

        root = etree.HTML(raw)
        info_json_text = root.xpath(
            "//script[@type='application/ld+json']")[0].text
        log.info(info_json_text)
        info_json = json.loads(info_json_text)

        title = info_json['name']
        authors = info_json['author'][0]['name'].split(",")
        publisher = info_json['publisher'][0]['name']
        isbn = info_json['workExample']['workExample']['isbn']
        pubdate = info_json['datePublished']

        comments = ""
        comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
        comments = "\n".join(comments_ele)

        tags = list()
        for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
            log.info(ele.text)
            if "/" in ele.text:
                tags.extend(ele.text.split("/"))
            if "/" in ele.text:
                tags.extend(ele.text.split("/"))
            else:
                tags.append(ele.text)

        cover_url = re.search(r'https[^\?\=\&]*' + bokelai_id + r'[^\?\=\&]*',
                              info_json['image']).group(0)

        if not authors:
            authors = [_('Unknown')]

        log.info(title, authors, publisher, isbn, pubdate, comments, tags,
                 cover_url)

        mi = Metadata(title, authors)
        mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
        mi.publisher = publisher
        mi.comments = comments
        mi.isbn = isbn
        mi.tags = tags
        if pubdate:
            try:
                from calibre.utils.date import parse_date, utcnow
                default = utcnow().replace(day=15)
                mi.pubdate = parse_date(pubdate,
                                        assume_utc=True,
                                        default=default)
            except:
                log.error('Failed to parse pubdate %r' % pubdate)

        if cover_url is not None:
            mi.has_bokelai_cover = cover_url
            self.cache_identifier_to_cover_url(mi.identifiers['bokelai'],
                                               mi.has_bokelai_cover)
        else:
            mi.has_bokelai_cover = None

        result_queue.put(mi)
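The bokelai snippet above reads its bibliographic data out of the page's embedded JSON-LD block. Below is a minimal, self-contained sketch of that extraction step (extract_ld_json is my own helper name, not from the snippet), assuming the detail page carries a single application/ld+json script element.

import json

from lxml import etree

def extract_ld_json(raw_html):
    # Locate the first <script type="application/ld+json"> element and decode
    # its JSON payload; return None if the page carries no such block.
    root = etree.HTML(raw_html)
    scripts = root.xpath("//script[@type='application/ld+json']")
    if not scripts or not scripts[0].text:
        return None
    return json.loads(scripts[0].text)

# info = extract_ld_json(raw)
# title, pubdate = info['name'], info['datePublished']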