def evaluate(self, formatter, kwargs, mi, locals, date1, date2):
    try:
        d1 = parse_date(date1)
        if d1 == UNDEFINED_DATE:
            return ""
        d2 = parse_date(date2)
        if d2 == UNDEFINED_DATE:
            return ""
    except:
        return ""
    i = d1 - d2
    # 86400 seconds in a day, so seconds // 8640 is the tenths-of-a-day digit
    return str("%d.%d" % (i.days, i.seconds // 8640))

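# A stdlib-only sketch of the "days.tenths" arithmetic above, with two
# hypothetical timezone-aware datetimes (the real function receives its
# dates as template strings and runs them through parse_date()):
from datetime import datetime, timezone

d1 = datetime(2024, 3, 2, 12, 0, tzinfo=timezone.utc)
d2 = datetime(2024, 3, 1, 0, 0, tzinfo=timezone.utc)
i = d1 - d2
# 1 day and 43200 seconds; 43200 // 8640 == 5, so the result is "1.5"
print("%d.%d" % (i.days, i.seconds // 8640))
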
def save_serialized_to_disk(ids, data, plugboards, root, opts, callback):
    from calibre.ebooks.metadata.opf2 import OPF
    root, opts, length = _sanitize_args(root, opts)
    failures = []
    for x in ids:
        opf, cover, format_map, last_modified = data[x]
        if isinstance(opf, unicode):
            opf = opf.encode('utf-8')
        mi = OPF(cStringIO.StringIO(opf)).to_book_metadata()
        try:
            mi.last_modified = parse_date(last_modified)
        except:
            pass
        tb = ''
        try:
            with open(cover, 'rb') as f:
                cover = f.read()
        except:
            cover = None
        try:
            failed, id, title = do_save_book_to_disk(x, mi, cover, plugboards,
                                                     format_map, root, opts, length)
            tb = _('Requested formats not available')
        except:
            failed, id, title = True, x, mi.title
            tb = traceback.format_exc()
        if failed:
            failures.append((id, title, tb))
        if callable(callback):
            if not callback(int(id), title, failed, tb):
                break
    return failures

def fix_pubdates(self):
    from calibre.utils.date import parse_date, strptime
    dirtied = False
    opf = self.container.opf
    for dcdate in opf.xpath('//dc:date',
            namespaces={'dc': 'http://purl.org/dc/elements/1.1/'}):
        raw = dcdate.text
        if not raw:
            raw = ''
        default = strptime('2000-1-1', '%Y-%m-%d', as_utc=True)
        try:
            ts = parse_date(raw, assume_utc=False, as_utc=True, default=default)
        except:
            raise InvalidEpub('Invalid date set in OPF', raw)
        try:
            sval = ts.strftime('%Y-%m-%d')
        except:
            from calibre import strftime
            sval = strftime('%Y-%m-%d', ts.timetuple())
        if sval != raw:
            self.log.error(
                'OPF contains date', raw, 'that epubcheck does not like')
            if self.fix:
                dcdate.text = sval
                self.log('\tReplaced', raw, 'with', sval)
                dirtied = True
    if dirtied:
        self.container.set(self.container.opf_name, opf)

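# The normalisation fix_pubdates() applies, sketched in isolation: any
# parseable value is rewritten as a bare YYYY-MM-DD string, which epubcheck
# accepts (the input timestamp here is hypothetical).
from calibre.utils.date import parse_date, strptime

default = strptime('2000-1-1', '%Y-%m-%d', as_utc=True)
ts = parse_date('2011-01-01T12:00:00+01:00', assume_utc=False, as_utc=True,
                default=default)
print(ts.strftime('%Y-%m-%d'))  # -> '2011-01-01'
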
def consolidate_metadata(info_mi, info):
    """
    When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec.
    """
    try:
        xmp_mi = metadata_from_xmp_packet(info["xmp_metadata"])
    except:
        import traceback
        traceback.print_exc()
        return info_mi
    info_title, info_authors, info_tags = (
        info_mi.title or _("Unknown"),
        list(info_mi.authors or ()),
        list(info_mi.tags or ()),
    )
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if "ModDate" in info and hasattr(xmp_mi, "metadata_date"):
        try:
            info_date = parse_date(info["ModDate"])
        except:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
    else:
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
        info_mi.authors, info_mi.tags = (xmp_mi.authors or info_mi.authors,
                                         xmp_mi.tags or info_mi.tags)
    return info_mi

def _parse_timestamp(root, mi):
    # <date value="1996-12-03">03.12.1996</date>
    xp = ('//fb2:document-info/fb2:date/@value|'
          '//fb2:document-info/fb2:date/text()')
    docdate = XPath('string(%s)' % xp)(root)
    if docdate:
        mi.timestamp = parse_date(docdate)

def consolidate_metadata(info_mi, info):
    '''
    When both the PDF Info dict and XMP metadata are present, prefer the xmp
    metadata unless the Info ModDate is newer than the XMP MetadataDate. This
    is the algorithm recommended by the PDF spec.
    '''
    try:
        raw = info['xmp_metadata'].rstrip()
        if not raw:
            return info_mi
        xmp_mi = metadata_from_xmp_packet(raw)
    except Exception:
        import traceback
        traceback.print_exc()
        return info_mi
    info_title, info_authors, info_tags = (
        info_mi.title or _('Unknown'),
        list(info_mi.authors or ()),
        list(info_mi.tags or ()))
    info_mi.smart_update(xmp_mi, replace_metadata=True)
    prefer_info = False
    if 'ModDate' in info and hasattr(xmp_mi, 'metadata_date'):
        try:
            info_date = parse_date(info['ModDate'])
        except Exception:
            pass
        else:
            prefer_info = info_date > xmp_mi.metadata_date
    if prefer_info:
        info_mi.title, info_mi.authors, info_mi.tags = info_title, info_authors, info_tags
    else:
        # We'll use the xmp tags/authors but fallback to the info ones if the
        # xmp does not have tags/authors. smart_update() should have taken care of
        # the rest
        info_mi.authors, info_mi.tags = (
            info_authors if xmp_mi.is_null('authors') else xmp_mi.authors,
            xmp_mi.tags or info_tags)
    return info_mi

def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 1)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data
    return mi

def __init__(self, prefix, lpath, title=None, authors=None, mime=None, date=None,
             ContentType=None, thumbnail_name=None, size=None, other=None):
    from calibre.utils.date import parse_date
    # debug_print('Book::__init__ - title=', title)
    show_debug = title is not None and title.lower().find("xxxxx") >= 0
    if other is not None:
        other.title = title
        other.published_date = date
    if show_debug:
        debug_print("Book::__init__ - title=", title, 'authors=', authors)
        debug_print("Book::__init__ - other=", other)
    super(Book, self).__init__(prefix, lpath, size, other)
    if title is not None and len(title) > 0:
        self.title = title
    if authors is not None and len(authors) > 0:
        self.authors_from_string(authors)
        if self.author_sort is None or self.author_sort == "Unknown":
            self.author_sort = author_to_author_sort(authors)
    self.mime = mime
    self.size = size  # will be set later if None
    if ContentType == '6' and date is not None:
        try:
            self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except:
            try:
                self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
            except:
                try:
                    self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%d")
                except:
                    try:
                        self.datetime = parse_date(date, assume_utc=True).timetuple()
                    except:
                        try:
                            self.datetime = time.gmtime(os.path.getctime(self.path))
                        except:
                            self.datetime = time.gmtime()
    self.kobo_metadata = Metadata(title, self.authors)
    self.contentID = None
    self.current_shelves = []
    self.kobo_collections = []
    self.can_put_on_shelves = True
    self.kobo_series = None
    self.kobo_series_number = None  # Kobo stores the series number as string. And it can have a leading "#".
    self.kobo_subtitle = None
    if thumbnail_name is not None:
        self.thumbnail = ImageWrapper(thumbnail_name)
    if show_debug:
        debug_print("Book::__init__ end - self=", self)
        debug_print("Book::__init__ end - title=", title, 'authors=', authors)

def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    from html5_parser import parse
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.library.comments import sanitize_comments_html

    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]
    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        return False

    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except:
            pass
    if lang:
        lang = lang[0].strip().lower()
        lang = {'english': 'eng', 'french': 'fra', 'german': 'deu',
                'spanish': 'spa'}.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print "ebook isbn is "+str(ebook_isbn[0])
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]
    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)
    return None

def from_json(obj):
    if '__class__' in obj:
        if obj['__class__'] == 'bytearray':
            return bytearray(base64.standard_b64decode(obj['__value__']))
        if obj['__class__'] == 'datetime.datetime':
            from calibre.utils.date import parse_date
            return parse_date(obj['__value__'], assume_utc=True)
    return obj

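# Hypothetical round trip for from_json(): it is meant to be used as a
# json.loads() object_hook, paired with an encoder that writes the same
# '__class__'/'__value__' envelope (the to_json() here is an assumption,
# sketched to match what from_json() expects).
import base64
import json
from datetime import datetime, timezone

def to_json(obj):
    if isinstance(obj, bytearray):
        return {'__class__': 'bytearray',
                '__value__': base64.standard_b64encode(bytes(obj)).decode('ascii')}
    if isinstance(obj, datetime):
        return {'__class__': 'datetime.datetime', '__value__': obj.isoformat()}
    raise TypeError(repr(obj))

raw = json.dumps({'when': datetime(2020, 1, 2, tzinfo=timezone.utc)}, default=to_json)
decoded = json.loads(raw, object_hook=from_json)  # decoded['when'] is a datetime again
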
def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        self.mi.authors.append(au)
        if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
            self.mi.author_sort = au.strip()
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([x.strip() for x in
                             clean_xml_chars(self.decode(content)).split(';')])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None

def string_to_datetime(src):
    from calibre.utils.date import parse_date
    if src != "None":
        try:
            return parse_date(src)
        except Exception:
            pass
    return None

def evaluate(self, formatter, kwargs, mi, locals, val, format_string):
    if not val or val == "None":
        return ""
    try:
        dt = parse_date(val)
        s = format_date(dt, format_string)
    except:
        s = "BAD DATE"
    return s

def get_date(self, entry, verbose):
    try:
        d = date(entry)
        if d:
            default = utcnow().replace(day=15)
            d = parse_date(d[0].text, assume_utc=True, default=default)
        else:
            d = None
    except:
        report(verbose)
        d = None
    return d

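# Why default=utcnow().replace(day=15): parse_date() (dateutil underneath)
# fills fields missing from the input with the default, so a bare year like
# '2009' resolves to the safe mid-month day 15 instead of whatever day
# "today" happens to be (which could make e.g. Feb 31 and raise). A sketch:
from calibre.utils.date import parse_date, utcnow

default = utcnow().replace(day=15)
d = parse_date('2009', assume_utc=True, default=default)
# d.year == 2009; the month comes from the default, the day is 15
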
def __init__(self, prefix, lpath, title=None, authors=None, mime=None, date=None,
             ContentType=None, thumbnail_name=None, size=None, other=None):
    # debug_print('Book::__init__ - title=', title)
    show_debug = title is not None and title.lower().find("xxxxx") >= 0
    if show_debug:
        debug_print("Book::__init__ - title=", title, 'authors=', authors)
        debug_print("Book::__init__ - other=", other)
    Book_.__init__(self, prefix, lpath, size, other)
    if title is not None and len(title) > 0:
        self.title = title
    if authors is not None and len(authors) > 0:
        self.authors_from_string(authors)
        if self.author_sort is None or self.author_sort == "Unknown":
            self.author_sort = author_to_author_sort(authors)
    self.mime = mime
    self.size = size  # will be set later if None
    if ContentType == '6' and date is not None:
        try:
            self.datetime = time.strptime(date, "%Y-%m-%dT%H:%M:%S.%f")
        except:
            try:
                self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
            except:
                try:
                    self.datetime = time.strptime(date.split('+')[0], "%Y-%m-%d")
                except:
                    try:
                        self.datetime = parse_date(date, assume_utc=True).timetuple()
                    except:
                        try:
                            self.datetime = time.gmtime(os.path.getctime(self.path))
                        except:
                            self.datetime = time.gmtime()
    self.contentID = None
    self.current_shelves = []
    self.kobo_collections = []
    self.kobo_series = None
    self.kobo_series_number = None
    if thumbnail_name is not None:
        self.thumbnail = ImageWrapper(thumbnail_name)
    if show_debug:
        debug_print("Book::__init__ end - self=", self)
        debug_print("Book::__init__ end - title=", title, 'authors=', authors)

def data2mi(self, item):
    """Converts a single metadata answer in the form of a dict to a
    MetadataInformation object"""

    mi = Metadata(_('Unknown'))

    # Regular metadata
    mi.title = item.get('title', None)
    mi.authors = item.get('authors', [])
    mi.publisher = item.get('publisher', None)

    if 'id' in item.keys():
        mi.set_identifier(self.idkey, item['id'])
    if 'doi' in item.keys():
        mi.set_identifier('doi', item['doi'])
    if 'isbn' in item.keys():
        mi.set_identifier('isbn', item['isbn'])

    if 'updated' in item.keys():
        mi.pubdate = parse_date(item['updated'], assume_utc=True)

    if 'series' in item.keys():
        mi.series = item['series']
        mi.series_index = self.format_series_index(item.get('series_index'), None)

    if 'year' in item.keys():
        mi.pubdate = parse_date(item['year'], assume_utc=True)

    if 'abstract' in item.keys():
        mi.comments = self.format_abstract(item['abstract'])

    if 'language' in item.keys():
        mi.language = item['language']

    if 'journal' in item.keys():
        mi.series = item['journal']
        mi.series_index = self.format_series_index(item.get('volume'), item.get('number'))

    if 'subject' in item.keys():
        tags = set([])
        for s in item['subject']:
            tags.update(msc_tags(s))
            tags.update(arxiv_tags(s))
        mi.tags = list(sorted(tags))

    return mi

def un_serialize_schedule(self, recipe):
    for x in recipe.iterdescendants():
        if 'schedule' in x.tag:
            sch, typ = x.text, x.get('type')
            if typ == 'interval':
                sch = float(sch)
            elif typ == 'day/time':
                sch = list(map(int, sch.split(':')))
            elif typ in ('days_of_week', 'days_of_month'):
                parts = sch.split(':')
                days = list(map(int, [x.strip() for x in parts[0].split(',')]))
                sch = [days, int(parts[1]), int(parts[2])]
            return typ, sch, parse_date(recipe.get('last_downloaded'))

def read_serialized_metadata(book_id, data):
    from calibre.ebooks.metadata.opf2 import OPF
    from calibre.utils.date import parse_date
    mi = OPF(data['opf'], try_to_guess_cover=False, populate_spine=False,
             basedir=os.path.dirname(data['opf'])).to_book_metadata()
    try:
        mi.last_modified = parse_date(data['last_modified'])
    except:
        pass
    mi.cover, mi.cover_data = None, (None, None)
    cdata = None
    if 'cover' in data:
        with lopen(data['cover'], 'rb') as f:
            cdata = f.read()
    return mi, cdata

def do_set_metadata(opts, mi, stream, stream_type):
    mi = MetaInformation(mi)
    for x in ('guide', 'toc', 'manifest', 'spine'):
        setattr(mi, x, None)
    from_opf = getattr(opts, 'from_opf', None)
    if from_opf is not None:
        from calibre.ebooks.metadata.opf2 import OPF
        opf_mi = OPF(open(from_opf, 'rb')).to_book_metadata()
        mi.smart_update(opf_mi)
    for pref in config().option_set.preferences:
        if pref.name in ('to_opf', 'from_opf', 'authors', 'title_sort',
                         'author_sort', 'get_cover', 'cover', 'tags',
                         'lrf_bookid', 'identifiers'):
            continue
        val = getattr(opts, pref.name, None)
        if val is not None:
            setattr(mi, pref.name, val)
    if getattr(opts, 'authors', None) is not None:
        mi.authors = string_to_authors(opts.authors)
        mi.author_sort = authors_to_sort_string(mi.authors)
    if getattr(opts, 'author_sort', None) is not None:
        mi.author_sort = opts.author_sort
    if getattr(opts, 'title_sort', None) is not None:
        mi.title_sort = opts.title_sort
    elif getattr(opts, 'title', None) is not None:
        mi.title_sort = title_sort(opts.title)
    if getattr(opts, 'tags', None) is not None:
        mi.tags = [t.strip() for t in opts.tags.split(',')]
    if getattr(opts, 'series', None) is not None:
        mi.series = opts.series.strip()
    if getattr(opts, 'series_index', None) is not None:
        mi.series_index = float(opts.series_index.strip())
    if getattr(opts, 'pubdate', None) is not None:
        mi.pubdate = parse_date(opts.pubdate, assume_utc=False, as_utc=False)
    if getattr(opts, 'identifiers', None):
        val = {k.strip(): v.strip() for k, v in
               (x.partition(':')[0::2] for x in opts.identifiers)}
        if val:
            orig = mi.get_identifiers()
            orig.update(val)
            val = {k: v for k, v in iteritems(orig) if k and v}
            mi.set_identifiers(val)
    if getattr(opts, 'cover', None) is not None:
        ext = os.path.splitext(opts.cover)[1].replace('.', '').upper()
        mi.cover_data = (ext, open(opts.cover, 'rb').read())
    with force_identifiers:
        set_metadata(stream, mi, stream_type)

def _c_convert_timestamp(val):
    if not val:
        return None
    try:
        ret = _c_speedup.parse_date(val.strip())
    except:
        ret = None
    if ret is None:
        return parse_date(val, as_utc=False)
    year, month, day, hour, minutes, seconds, tzsecs = ret
    try:
        return datetime(year, month, day, hour, minutes, seconds,
                        tzinfo=tzoffset(None, tzsecs)).astimezone(local_tz)
    except OverflowError:
        return UNDEFINED_DATE.astimezone(local_tz)

def test_datetime(self):  # {{{
    ' Test the reading of datetimes stored in the db '
    from calibre.utils.date import parse_date
    from calibre.db.tables import c_parse, UNDEFINED_DATE, _c_speedup

    # First test parsing of string to UTC time
    for raw in ('2013-07-22 15:18:29+05:30', ' 2013-07-22 15:18:29+00:00',
                '2013-07-22 15:18:29', '2003-09-21 23:30:00-06:00'):
        self.assertTrue(_c_speedup(raw))
        ctime = c_parse(raw)
        pytime = parse_date(raw, assume_utc=True)
        self.assertEqual(ctime, pytime)

    self.assertEqual(c_parse(2003).year, 2003)
    for x in (None, '', 'abc'):
        self.assertEqual(UNDEFINED_DATE, c_parse(x))

def _py_convert_timestamp(val):
    if val:
        tzsecs = 0
        try:
            sign = {'+': 1, '-': -1}.get(val[-6], None)
            if sign is not None:
                tzsecs = 60 * ((int(val[-5:-3]) * 60 + int(val[-2:])) * sign)
            year = int(val[0:4])
            month = int(val[5:7])
            day = int(val[8:10])
            hour = int(val[11:13])
            min = int(val[14:16])
            sec = int(val[17:19])
            return datetime(year, month, day, hour, min, sec,
                            tzinfo=tzoffset(None, tzsecs))
        except:
            pass
        return parse_date(val, as_utc=False)
    return None

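# The fixed-width slices above, traced for the common SQLite timestamp
# form 'YYYY-MM-DD HH:MM:SS+HH:MM':
val = '2013-07-22 15:18:29+05:30'
sign = {'+': 1, '-': -1}[val[-6]]                              # '+'
tzsecs = 60 * ((int(val[-5:-3]) * 60 + int(val[-2:])) * sign)
# (5 * 60 + 30) * 60 == 19800 seconds east of UTC
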
def convert_kobo_date(kobo_date):
    """
    KoBo stores dates as a timestamp string. The exact format has changed
    with firmware and what part of the firmware writes it. The following
    is overkill, but it handles all the formats I have seen.
    """
    from calibre.utils.date import utc_tz, local_tz
    from calibre.devices.usbms.driver import debug_print
    # debug_print("convert_kobo_date - start - kobo_date={0}'".format(kobo_date))
    try:
        # The '+00:00' suffix is matched literally ('%+' is not a strptime directive)
        converted_date = datetime.datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S+00:00")
        # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S+00:00' - kobo_date={0}'".format(kobo_date))
    except Exception as e:
        # debug_print("convert_kobo_date - exception={0}'".format(e))
        try:
            converted_date = datetime.datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%SZ")
            # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%SZ' - kobo_date={0}'".format(kobo_date))
        except:
            try:
                converted_date = datetime.datetime.strptime(kobo_date[0:19], "%Y-%m-%dT%H:%M:%S")
                # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
            except:
                try:
                    converted_date = datetime.datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                    # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
                except:
                    try:
                        converted_date = datetime.datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%d")
                        # converted_date = converted_date.replace(tzinfo=utc_tz)
                        # debug_print("convert_kobo_date - '%Y-%m-%d' - kobo_date={0}'".format(kobo_date))
                    except:
                        try:
                            from calibre.utils.date import parse_date
                            converted_date = parse_date(kobo_date)  # , assume_utc=True)
                            # debug_print("convert_kobo_date - parse_date - kobo_date={0}'".format(kobo_date))
                        except:
                            # Note: time.gmtime() returns a struct_time; the tz
                            # conversion below expects a datetime, so this last
                            # resort would fail there
                            converted_date = time.gmtime()
                            debug_print("convert_kobo_date - could not convert, using current time - kobo_date={0}'".format(kobo_date))
    converted_date = converted_date.replace(tzinfo=utc_tz).astimezone(local_tz)
    return converted_date

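# A few of the timestamp shapes handled above, sketched (the inputs are
# hypothetical but mirror values seen in Kobo's SQLite database):
for raw in ('2018-03-01T12:34:56+00:00',   # first branch, literal offset
            '2018-03-01T12:34:56Z',        # Zulu suffix
            '2018-03-01T12:34:56.789000',  # fractional seconds, truncated to 19 chars
            '2018-03-01'):                 # bare date
    print(raw, '->', convert_kobo_date(raw))
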
def schedule_recipe(self, recipe, schedule_type, schedule, last_downloaded=None):
    with self.lock:
        for x in list(self.iter_recipes()):
            if x.get('id', False) == recipe.get('id'):
                ld = x.get('last_downloaded', None)
                if ld and last_downloaded is None:
                    try:
                        last_downloaded = parse_date(ld)
                    except:
                        pass
                self.root.remove(x)
                break
        if last_downloaded is None:
            last_downloaded = fromordinal(1)
        sr = E.scheduled_recipe({
            'id': recipe.get('id'),
            'title': recipe.get('title'),
            'last_downloaded': isoformat(last_downloaded),
        }, self.serialize_schedule(schedule_type, schedule))
        self.root.append(sr)
        self.write_scheduler_file()

def __init__(self, devs, blacklist):
    QWidget.__init__(self)
    self.l = l = QVBoxLayout()
    self.setLayout(l)
    self.la = la = QLabel('<p>' + _(
        'Select the devices to be <b>ignored</b>. calibre <b>will not</b> '
        'connect to devices with a checkmark next to their names.'))
    la.setWordWrap(True)
    l.addWidget(la)
    self.f = f = QListWidget(self)
    l.addWidget(f)
    devs = [(snum, (x[0], parse_date(x[1]))) for snum, x in devs.iteritems()]
    for dev, x in sorted(devs, key=lambda x: x[1][1], reverse=True):
        name = x[0]
        name = '%s [%s]' % (name, dev)
        item = QListWidgetItem(name, f)
        item.setData(Qt.UserRole, dev)
        item.setFlags(Qt.ItemIsEnabled | Qt.ItemIsUserCheckable | Qt.ItemIsSelectable)
        item.setCheckState(Qt.Checked if dev in blacklist else Qt.Unchecked)

def convert_kobo_date(kobo_date):
    """
    KoBo stores dates as a timestamp string. The exact format has changed
    with firmware and what part of the firmware writes it. The following
    is overkill, but it handles all the formats I have seen.
    """
    from calibre.utils.date import utc_tz
    try:
        # Validate the fractional-seconds form, then re-parse just the first
        # 19 characters without the fraction
        converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S.%f")
        converted_date = datetime.strptime(kobo_date[0:19], "%Y-%m-%dT%H:%M:%S")
        converted_date = converted_date.replace(tzinfo=utc_tz)
        # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S.%f' - kobo_date={0}'".format(kobo_date))
    except:
        try:
            # The '+00:00' suffix is matched literally; '%+' is not a valid
            # strptime directive
            converted_date = datetime.strptime(kobo_date, "%Y-%m-%dT%H:%M:%S+00:00")
            # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S+00:00' - kobo_date={0}'".format(kobo_date))
        except:
            try:
                converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%dT%H:%M:%S")
                converted_date = converted_date.replace(tzinfo=utc_tz)
                # debug_print("convert_kobo_date - '%Y-%m-%dT%H:%M:%S' - kobo_date={0}'".format(kobo_date))
            except:
                try:
                    converted_date = datetime.strptime(kobo_date.split('+')[0], "%Y-%m-%d")
                    converted_date = converted_date.replace(tzinfo=utc_tz)
                    # debug_print("convert_kobo_date - '%Y-%m-%d' - kobo_date={0}'".format(kobo_date))
                except:
                    try:
                        from calibre.utils.date import parse_date
                        converted_date = parse_date(kobo_date, assume_utc=True)
                        # debug_print("convert_kobo_date - parse_date - kobo_date={0}'".format(kobo_date))
                    except:
                        # try:
                        #     converted_date = time.gmtime(os.path.getctime(self.path))
                        #     debug_print("convert_kobo_date - time.gmtime(os.path.getctime(self.path)) - kobo_date={0}'".format(kobo_date))
                        # except:
                        converted_date = time.gmtime()
                        debug_print("convert_kobo_date - time.gmtime() - kobo_date={0}'".format(kobo_date))
    return converted_date

def c_parse(val):
    try:
        year, month, day, hour, minutes, seconds, tzsecs = _c_speedup(val)
    except (AttributeError, TypeError):
        # If a value like 2001 is stored in the column, apsw will return it as
        # an int
        if isinstance(val, (int, float)):
            return datetime(int(val), 1, 3, tzinfo=utc_tz)
    except:
        pass
    else:
        try:
            ans = datetime(year, month, day, hour, minutes, seconds, tzinfo=utc_tz)
            if tzsecs != 0:
                ans -= timedelta(seconds=tzsecs)
        except OverflowError:
            ans = UNDEFINED_DATE
        return ans
    try:
        return parse_date(val, as_utc=True, assume_utc=True)
    except ValueError:
        return UNDEFINED_DATE

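# Behaviour sketch for c_parse(), mirroring the unit test elsewhere in this
# section: strings parse to tz-aware UTC datetimes, bare ints become a date
# in that year, and junk maps to UNDEFINED_DATE.
from calibre.db.tables import c_parse, UNDEFINED_DATE

c_parse('2013-07-22 15:18:29+05:30')  # -> tz-aware datetime in UTC
c_parse(2003).year                    # -> 2003
c_parse('abc') == UNDEFINED_DATE      # -> True
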
def opts_to_mi(self, mi):
    from calibre.ebooks.metadata import string_to_authors
    for x in self.metadata_option_names:
        val = getattr(self.opts, x, None)
        if val is not None:
            if x == 'authors':
                val = string_to_authors(val)
            elif x == 'tags':
                val = [i.strip() for i in val.split(',')]
            elif x in ('rating', 'series_index'):
                try:
                    val = float(val)
                except ValueError:
                    self.log.warn(_('Values of series index and rating must'
                                    ' be numbers. Ignoring'), val)
                    continue
            elif x in ('timestamp', 'pubdate'):
                try:
                    val = parse_date(val, assume_utc=x == 'pubdate')
                except:
                    self.log.exception(_('Failed to parse date/time') + ' ' + unicode(val))
                    continue
            setattr(mi, x, val)

def string_to_datetime(src):
    from calibre.utils.date import parse_date
    if src == "None":
        return None
    return parse_date(src)

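# Behaviour sketch contrasting the string_to_datetime() variants in this
# section: this one lets parse errors propagate, while the earlier variant
# swallows them and returns None.
string_to_datetime('None')                       # -> None
string_to_datetime('2019-05-06T07:08:09+00:00')  # -> timezone-aware datetime
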
def parse(self, raw, desc_raw):
    from calibre.ebooks.metadata.book.base import Metadata
    from calibre.utils.date import parse_date, utcnow
    import json

    root = parse_html(raw.decode('gb18030'))
    title = root.xpath('//*[@id="name"]/div[1]/text()')
    title = title[0].strip()
    authors = []
    for i in root.xpath('//*[@id="p-author"]/a'):
        authors.append(i.text.strip())
    mi = Metadata(title, authors)

    information = root.xpath('//*[@id="parameter2"]/li')
    info = dict()
    for i in information:
        tmp = etree.tostring(i, method='text', encoding='utf-8').split(u':')
        info[tmp[0].strip()] = tmp[1].strip()

    # Identifiers
    mi.identifiers = self.plugin.identifiers
    mi.identifiers['jd'] = self.sku
    isbn = info['ISBN']
    self.log.error(isbn)
    if isbn:
        mi.isbn = isbn
        self.plugin.cache_isbn_to_identifier(isbn, self.sku)
        mi.identifiers['isbn'] = isbn

    # Publisher
    mi.publisher = info.get(u'出版社')

    # Pubdate
    pubdate = info.get(u'出版时间')
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except:
            self.log.error('Failed to parse pubdate %r' % pubdate)

    # Series
    mi.series = info.get(u'丛书名')

    img = root.xpath('//*[@id="spec-n1"]/img')
    cover = img[0].get('src')
    if cover:
        if not cover.startswith('http'):
            cover = 'https:' + cover
        self.plugin.cache_identifier_to_cover_url(self.sku, cover)
        self.log.error(cover)
    mi.has_cover = self.plugin.cached_identifier_to_cover_url(self.sku) is not None

    # Comments
    # showdesc({"date":1583588455348,"content":" ... "})
    try:
        desc = json.loads(desc_raw[9:-1].decode('gb18030'))
        desc_root = parse_html(desc['content'])
        div = desc_root.xpath('//*[@id="detail-tag-id-3"]/div[2]/div/text()')
        comments = div[0]
        mi.comments = comments
    finally:
        # the finally clause swallows any parse failure and still returns mi
        return mi

def get_dates_matches(self, location, query, candidates):
    matches = set([])
    if len(query) < 2:
        return matches

    if location == 'date':
        location = 'timestamp'
    loc = self.field_metadata[location]['rec_index']

    if query == 'false':
        for id_ in candidates:
            item = self._data[id_]
            if item is None:
                continue
            v = item[loc]
            if isinstance(v, (bytes, unicode_type)):
                v = parse_date(v)
            if v is None or v <= UNDEFINED_DATE:
                matches.add(item[0])
        return matches
    if query == 'true':
        for id_ in candidates:
            item = self._data[id_]
            if item is None:
                continue
            v = item[loc]
            if isinstance(v, (bytes, unicode_type)):
                v = parse_date(v)
            if v is not None and v > UNDEFINED_DATE:
                matches.add(item[0])
        return matches

    relop = None
    for k in self.date_search_relops.keys():
        if query.startswith(k):
            (p, relop) = self.date_search_relops[k]
            query = query[p:]
    if relop is None:
        (p, relop) = self.date_search_relops['=']

    if query in self.local_today:
        qd = now()
        field_count = 3
    elif query in self.local_yesterday:
        qd = now() - timedelta(1)
        field_count = 3
    elif query in self.local_thismonth:
        qd = now()
        field_count = 2
    elif query.endswith(self.local_daysago) or query.endswith(self.untrans_daysago):
        num = query[0:-(self.local_daysago_len if query.endswith(self.local_daysago)
                        else self.untrans_daysago_len)]
        try:
            qd = now() - timedelta(int(num))
        except:
            raise ParseException(_('Number conversion error: {0}').format(num))
        field_count = 3
    else:
        try:
            qd = parse_date(query, as_utc=False)
        except:
            raise ParseException(_('Date conversion error: {0}').format(query))
        if '-' in query:
            field_count = query.count('-') + 1
        else:
            field_count = query.count('/') + 1

    for id_ in candidates:
        item = self._data[id_]
        if item is None or item[loc] is None:
            continue
        v = item[loc]
        if isinstance(v, (bytes, unicode_type)):
            v = parse_date(v)
        if relop(v, qd, field_count):
            matches.add(item[0])
    return matches

def adapt_datetime(x):
    if isinstance(x, (unicode, bytes)):
        x = parse_date(x, assume_utc=False, as_utc=False)
    if x and is_date_undefined(x):
        x = UNDEFINED_DATE
    return x

def metadata_from_filename(name, pat=None):
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_date
                mi.pubdate = parse_date(pubdate)
        except:
            pass
    if mi.is_null('title'):
        mi.title = name
    return mi

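# A minimal sketch of how the named groups drive the result (the default
# pattern comes from calibre's prefs; this one is hypothetical). Note the
# 'swap_author_names' preference may further rewrite the authors.
import re

pat = re.compile(r'(?P<author>.+?) - (?P<title>.+)')
mi = metadata_from_filename('Jane Doe - Some Book.epub', pat=pat)
# mi.authors == ['Jane Doe'], mi.title == 'Some Book'
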
def safe_parse_date(raw):
    if raw:
        try:
            return parse_date(raw)
        except Exception:
            pass

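# Typical use, as in metadata_from_xmp_packet() below: each timestamp is
# optional, and a missing or unparseable value simply yields None.
md = safe_parse_date('2020-01-02T03:04:05+00:00')  # -> aware datetime
mod = safe_parse_date(None)                        # -> None, no exception
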
def accept(self):
    col = unicode_type(self.column_name_box.text()).strip()
    if not col:
        return self.simple_error('', _('No lookup name was provided'))
    if col.startswith('#'):
        col = col[1:]
    if re.match(r'^\w*$', col) is None or not col[0].isalpha() or col.lower() != col:
        return self.simple_error('', _(
            'The lookup name must contain only lower case letters, digits '
            'and underscores, and start with a letter'))
    if col.endswith('_index'):
        return self.simple_error('', _(
            'Lookup names cannot end with _index, because these names are '
            'reserved for the index of a series column.'))
    col_heading = unicode_type(self.column_heading_box.text()).strip()
    coldef = self.column_types[self.column_type_box.currentIndex()]
    col_type = coldef['datatype']
    if col_type[0] == '*':
        col_type = col_type[1:]
        is_multiple = True
    else:
        is_multiple = False
    if not col_heading:
        return self.simple_error('', _('No column heading was provided'))
    db = self.parent.gui.library_view.model().db
    key = db.field_metadata.custom_field_prefix + col
    bad_col = False
    if key in self.parent.custcols:
        if not self.editing_col or \
                self.parent.custcols[key]['colnum'] != self.orig_column_number:
            bad_col = True
    if bad_col:
        return self.simple_error('', _('The lookup name %s is already used') % col)
    bad_head = False
    for t in self.parent.custcols:
        if self.parent.custcols[t]['name'] == col_heading:
            if not self.editing_col or \
                    self.parent.custcols[t]['colnum'] != self.orig_column_number:
                bad_head = True
    for t in self.standard_colheads:
        if self.standard_colheads[t] == col_heading:
            bad_head = True
    if bad_head:
        return self.simple_error('', _('The heading %s is already used') % col_heading)

    display_dict = {}
    default_val = (unicode_type(self.default_value.text()).strip()
                   if col_type != 'composite' else None)

    if col_type == 'datetime':
        if unicode_type(self.format_box.text()).strip():
            display_dict = {'date_format': unicode_type(self.format_box.text()).strip()}
        else:
            display_dict = {'date_format': None}
        if default_val:
            if default_val == _('Now'):
                display_dict['default_value'] = 'now'
            else:
                try:
                    tv = parse_date(default_val)
                except:
                    tv = UNDEFINED_DATE
                if tv == UNDEFINED_DATE:
                    return self.simple_error(
                        _('Invalid default value'),
                        _('The default value must be "Now" or a date'))
                display_dict['default_value'] = default_val
    elif col_type == 'composite':
        if not unicode_type(self.composite_box.text()).strip():
            return self.simple_error('', _('You must enter a template for '
                                           'composite columns'))
        display_dict = {
            'composite_template': unicode_type(self.composite_box.text()).strip(),
            'composite_sort': ['text', 'number', 'date', 'bool'][self.composite_sort_by.currentIndex()],
            'make_category': self.composite_make_category.isChecked(),
            'contains_html': self.composite_contains_html.isChecked(),
        }
    elif col_type == 'enumeration':
        if not unicode_type(self.enum_box.text()).strip():
            return self.simple_error('', _('You must enter at least one '
                                           'value for enumeration columns'))
        l = [v.strip() for v in unicode_type(self.enum_box.text()).split(',') if v.strip()]
        l_lower = [v.lower() for v in l]
        for i, v in enumerate(l_lower):
            if v in l_lower[i + 1:]:
                return self.simple_error('', _(
                    'The value "{0}" is in the list more than once, '
                    'perhaps with different case').format(l[i]))
        c = unicode_type(self.enum_colors.text())
        if c:
            c = [v.strip() for v in unicode_type(self.enum_colors.text()).split(',')]
        else:
            c = []
        if len(c) != 0 and len(c) != len(l):
            return self.simple_error('', _(
                'The colors box must be empty or contain the same number of '
                'items as the value box'))
        for tc in c:
            if tc not in QColor.colorNames() and not re.match(
                    "#(?:[0-9a-f]{3}){1,4}", tc, re.I):
                return self.simple_error('', _('The color {0} is unknown').format(tc))
        display_dict = {'enum_values': l, 'enum_colors': c}
        if default_val:
            if default_val not in l:
                return self.simple_error(
                    _('Invalid default value'),
                    _('The default value must be one of the permitted values'))
            display_dict['default_value'] = default_val
    elif col_type == 'text' and is_multiple:
        display_dict = {'is_names': self.is_names.isChecked()}
    elif col_type in ['int', 'float']:
        if unicode_type(self.format_box.text()).strip():
            display_dict = {'number_format': unicode_type(self.format_box.text()).strip()}
        else:
            display_dict = {'number_format': None}
        if default_val:
            try:
                if col_type == 'int':
                    msg = _('The default value must be an integer')
                    tv = int(default_val)
                    display_dict['default_value'] = tv
                else:
                    msg = _('The default value must be a real number')
                    tv = float(default_val)
                    display_dict['default_value'] = tv
            except:
                return self.simple_error(_('Invalid default value'), msg)
    elif col_type == 'comments':
        display_dict['heading_position'] = unicode_type(self.comments_heading_position.currentData())
        display_dict['interpret_as'] = unicode_type(self.comments_type.currentData())
    elif col_type == 'rating':
        half_stars = bool(self.allow_half_stars.isChecked())
        display_dict['allow_half_stars'] = half_stars
        if default_val:
            try:
                tv = int((float(default_val) if half_stars else int(default_val)) * 2)
            except:
                tv = -1
            if tv < 0 or tv > 10:
                if half_stars:
                    return self.simple_error(
                        _('Invalid default value'),
                        _('The default value must be a real number between 0 and 5.0'))
                else:
                    return self.simple_error(
                        _('Invalid default value'),
                        _('The default value must be an integer between 0 and 5'))
            display_dict['default_value'] = tv
    elif col_type == 'bool':
        if default_val:
            tv = {_('Yes'): True, _('No'): False}.get(default_val, None)
            if tv is None:
                return self.simple_error(
                    _('Invalid default value'),
                    _('The default value must be "Yes" or "No"'))
            display_dict['default_value'] = tv

    if col_type in ['text', 'composite', 'enumeration'] and not is_multiple:
        display_dict['use_decorations'] = self.use_decorations.checkState()

    if default_val and 'default_value' not in display_dict:
        display_dict['default_value'] = default_val

    display_dict['description'] = self.description_box.text().strip()

    if not self.editing_col:
        self.parent.custcols[key] = {
            'label': col,
            'name': col_heading,
            'datatype': col_type,
            'display': display_dict,
            'normalized': None,
            'colnum': None,
            'is_multiple': is_multiple,
        }
        self.parent.cc_column_key = key
    else:
        self.parent.custcols[self.orig_column_name]['label'] = col
        self.parent.custcols[self.orig_column_name]['name'] = col_heading
        # Remove any previous default value
        self.parent.custcols[self.orig_column_name]['display'].pop('default_value', None)
        self.parent.custcols[self.orig_column_name]['display'].update(display_dict)
        self.parent.custcols[self.orig_column_name]['*edited'] = True
        self.parent.custcols[self.orig_column_name]['*must_restart'] = True
        self.parent.cc_column_key = key
    QDialog.accept(self)

def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # https://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long
    comment_tags, meta_tags, meta_tag_ids, title_tag = parse_metadata(src)

    def get_all(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = [x.strip() for x in ans if x.strip()]
        if not ans:
            ans = None
        return ans

    def get(field):
        ans = get_all(field)
        if ans:
            ans = ans[0]
        return ans

    # Title
    title = get('title') or title_tag.strip() or _('Unknown')

    # Author
    authors = authors_to_string(get_all('authors')) or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title, string_to_authors(authors))

    # Single-value text fields
    for field in ('publisher', 'isbn'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    # Multi-value text fields
    for field in ('languages',):
        val = get_all(field)
        if val:
            setattr(mi, field, val)

    # HTML fields: escape raw characters into entities
    for field in ('comments',):
        val = get(field)
        if val:
            setattr(mi, field, val.replace('&', '&amp;').replace('<', '&lt;')
                    .replace('>', '&gt;').replace('"', '&quot;').replace("'", '&#39;'))

    # Date fields
    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 10:
                mi.rating = 0
        except:
            pass

    # TAGS
    tags = get_all('tags')
    if tags:
        tags = [x.strip() for s in tags for x in s.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    # IDENTIFIERS
    for (k, v) in iteritems(meta_tag_ids):
        v = [x.strip() for x in v if x.strip()]
        if v:
            mi.set_identifier(k, v[0])

    return mi

def string_to_datetime(src):
    if src == "None":
        return None
    return parse_date(src)

def date_sort_key(self, val):
    try:
        val = self._filter_date(parse_date(val))
    except (TypeError, ValueError, AttributeError, KeyError):
        val = UNDEFINED_DATE
    return val

def __call__(self, query, field_iter):
    matches = set()
    if len(query) < 2:
        return matches

    if query == 'false':
        for v, book_ids in field_iter():
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if v is None or v <= UNDEFINED_DATE:
                matches |= book_ids
        return matches

    if query == 'true':
        for v, book_ids in field_iter():
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if v is not None and v > UNDEFINED_DATE:
                matches |= book_ids
        return matches

    relop = None
    for k, op in self.operators.iteritems():
        if query.startswith(k):
            p, relop = op
            query = query[p:]
    if relop is None:
        relop = self.operators['='][-1]

    if query in self.local_today:
        qd = now()
        field_count = 3
    elif query in self.local_yesterday:
        qd = now() - timedelta(1)
        field_count = 3
    elif query in self.local_thismonth:
        qd = now()
        field_count = 2
    else:
        m = self.daysago_pat.search(query)
        if m is not None:
            num = query[:-len(m.group(1))]
            try:
                qd = now() - timedelta(int(num))
            except:
                raise ParseException(_('Number conversion error: {0}').format(num))
            field_count = 3
        else:
            try:
                qd = parse_date(query, as_utc=False)
            except:
                raise ParseException(_('Date conversion error: {0}').format(query))
            if '-' in query:
                field_count = query.count('-') + 1
            else:
                field_count = query.count('/') + 1

    for v, book_ids in field_iter():
        if isinstance(v, (str, unicode)):
            v = parse_date(v)
        if v is not None and relop(dt_as_local(v), qd, field_count):
            matches |= book_ids
    return matches

def parse_exported_highlights(self, raw, log_failure=True):
    """
    Extract highlights from pasted Annotations summary, add them to selected book
    in calibre library

    Construct a BookStruct object with the book's metadata.
    Starred items are minimally required.
    BookStruct properties:
        *active: [True|False]
        *author: "John Smith"
         author_sort: (if known)
        *book_id: an int uniquely identifying the book.
                  Highlights are associated with books through book_id
         genre: "Fiction" (if known)
        *title: "The Story of John Smith"
         title_sort: "Story of John Smith, The" (if known)
         uuid: Calibre's uuid for this book, if known

    Construct an AnnotationStruct object with the highlight's metadata.
    Starred items are minimally required. Dashed items (highlight_text and
    note_text) may be one or both.
    AnnotationStruct properties:
        annotation_id: an int uniquely identifying the annotation
        *book_id: The book this annotation is associated with
        highlight_color: [Blue|Gray|Green|Pink|Purple|Underline|Yellow]
        -highlight_text: A list of paragraphs constituting the highlight
        last_modification: The timestamp of the annotation
        location: location of highlight in the book
        -note_text: A list of paragraphs constituting the note
        *timestamp: Unique timestamp of highlight's creation/modification time
    """
    # Create the annotations, books table as needed
    self.annotations_db = "%s_imported_annotations" % self.app_name_
    self.create_annotations_table(self.annotations_db)
    self.books_db = "%s_imported_books" % self.app_name_
    self.create_books_table(self.books_db)

    self.annotated_book_list = []
    self.selected_books = None

    # Generate the book metadata from the selected book
    row = self.opts.gui.library_view.currentIndex()
    book_id = self.opts.gui.library_view.model().id(row)
    db = self.opts.gui.current_db
    mi = db.get_metadata(book_id, index_is_id=True)

    try:
        lines = raw.split('\n')
        if len(lines) < 5:
            raise AnnotationsException("Invalid annotations summary")
        index = 0
        annotations = {}

        # Get the title, author, publisher from the first three lines
        title = lines[index]
        index += 1
        author = lines[index]
        index += 1
        publisher = lines[index]
        index += 1

        # Next line should be the first timestamp/location
        while index < len(lines):
            tsl = re.match(r'^(?P<timestamp>.*) \((?P<location>Page .*)\)', lines[index])
            if tsl:
                ts = tsl.group('timestamp')
                isoformat = parse_date(ts, as_utc=False)
                isoformat = isoformat.replace(hour=12)
                timestamp = mktime(isoformat.timetuple())
                while timestamp in annotations:
                    timestamp += 60
                location = tsl.group('location')
                index += 1

                # Continue with highlight
                highlight_text = lines[index]
                index += 1

                # Next line is either Note: or a new tsl
                note = re.match(r'^Notes: (?P<note_text>.*)', lines[index])
                note_text = None
                if note:
                    note_text = note.group('note_text')
                    index += 1

                if re.match(r'^(?P<timestamp>.*) \((?P<location>Page .*)\)', lines[index]):
                    # New note - store the old one, continue
                    ann = AnnotationStruct()
                    ann.book_id = mi.id
                    ann.annotation_id = index
                    ann.highlight_color = 'Yellow'
                    ann.highlight_text = highlight_text
                    ann.location = location
                    ann.location_sort = "%05d" % int(re.match(r'^Page (?P<page>\d+).*$', location).group('page'))
                    ann.note_text = note_text
                    ann.last_modification = timestamp

                    # Add annotation to db
                    annotations[timestamp] = ann
                    continue
                else:
                    # Store the last one
                    ann = AnnotationStruct()
                    ann.book_id = mi.id
                    ann.annotation_id = index
                    ann.highlight_color = 'Yellow'
                    ann.highlight_text = highlight_text
                    ann.location = location
                    ann.location_sort = "%05d" % int(re.match(r'^Page (?P<page>\d+).*$', location).group('page'))
                    ann.note_text = note_text
                    ann.last_modification = timestamp
                    annotations[timestamp] = ann
                    break
    except:
        if log_failure:
            self._log(" unable to parse %s Annotations" % self.app_name)
            self._log("{:~^80}".format(" Imported Annotation summary "))
            self._log(raw)
            self._log("{:~^80}".format(" end imported Annotations summary "))
            import traceback
            traceback.print_exc()
            msg = ('Unable to parse Annotation summary from %s. ' % self.app_name +
                   'Paste entire contents of emailed summary.')
            MessageBox(MessageBox.WARNING,
                       'Error importing annotations',
                       msg,
                       show_copy_button=False,
                       parent=self.opts.gui).exec_()
            self._log_location("WARNING: %s" % msg)
        return False

    # Populate a BookStruct
    book_mi = BookStruct()
    book_mi.active = True
    book_mi.author = author
    book_mi.book_id = mi.id
    book_mi.title = title
    book_mi.uuid = None
    book_mi.last_update = time.mktime(time.localtime())
    book_mi.reader_app = self.app_name
    book_mi.cid = mi.id
    book_mi.annotations = len(annotations)

    # Add book to books_db
    self.add_to_books_db(self.books_db, book_mi)
    self.annotated_book_list.append(book_mi)

    # Add the annotations
    for timestamp in sorted(annotations.keys()):
        self.add_to_annotations_db(self.annotations_db, annotations[timestamp])
        self.update_book_last_annotation(self.books_db, timestamp, mi.id)
        self.opts.pb.increment()
    self.update_book_last_annotation(self.books_db, timestamp, mi.id)

    # Update the timestamp
    self.update_timestamp(self.annotations_db)
    self.update_timestamp(self.books_db)
    self.commit()

    # Return True if successful
    return True

def get_dates_matches(self, location, query, candidates):
    matches = set([])
    if len(query) < 2:
        return matches

    if location == 'date':
        location = 'timestamp'
    loc = self.field_metadata[location]['rec_index']

    if query == 'false':
        for id_ in candidates:
            item = self._data[id_]
            if item is None:
                continue
            v = item[loc]
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if v is None or v <= UNDEFINED_DATE:
                matches.add(item[0])
        return matches
    if query == 'true':
        for id_ in candidates:
            item = self._data[id_]
            if item is None:
                continue
            v = item[loc]
            if isinstance(v, (str, unicode)):
                v = parse_date(v)
            if v is not None and v > UNDEFINED_DATE:
                matches.add(item[0])
        return matches

    relop = None
    for k in self.date_search_relops.keys():
        if query.startswith(k):
            (p, relop) = self.date_search_relops[k]
            query = query[p:]
    if relop is None:
        (p, relop) = self.date_search_relops['=']

    if query in self.local_today:
        qd = now()
        field_count = 3
    elif query in self.local_yesterday:
        qd = now() - timedelta(1)
        field_count = 3
    elif query in self.local_thismonth:
        qd = now()
        field_count = 2
    elif query.endswith(self.local_daysago) or query.endswith(self.untrans_daysago):
        num = query[0:-(self.local_daysago_len if query.endswith(self.local_daysago)
                        else self.untrans_daysago_len)]
        try:
            qd = now() - timedelta(int(num))
        except:
            raise ParseException(_('Number conversion error: {0}').format(num))
        field_count = 3
    else:
        try:
            qd = parse_date(query, as_utc=False)
        except:
            raise ParseException(_('Date conversion error: {0}').format(query))
        if '-' in query:
            field_count = query.count('-') + 1
        else:
            field_count = query.count('/') + 1

    for id_ in candidates:
        item = self._data[id_]
        if item is None or item[loc] is None:
            continue
        v = item[loc]
        if isinstance(v, (str, unicode)):
            v = parse_date(v)
        if relop(v, qd, field_count):
            matches.add(item[0])
    return matches

def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        if title.startswith(r'\376\377'):
            # corrupted XMP packet generated by Nitro PDF. See
            # https://bugs.launchpad.net/calibre/+bug/1541981
            raise ValueError('Corrupted XMP metadata packet detected, probably generated by Nitro PDF')
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = [au for aus in authors for au in string_to_authors(aus)]
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or
                             first_simple('//xmp:CreateDate', root), assume_utc=False)
    except:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = safe_parse_date(first_simple('//xmp:MetadataDate', root))
    mod = safe_parse_date(first_simple('//xmp:ModifyDate', root))
    fd = more_recent(md, mod)
    if fd is not None:
        mi.metadata_date = fd
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple(f'//{namespace}:{scheme}', root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val

    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in iteritems({'doi': check_doi, 'isbn': check_isbn}):
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the matching scheme, not unconditionally 'doi'
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi

def get_metadata(stream, extract_cover=True):
    zin = zipfile.ZipFile(stream, 'r')
    odfs = odfmetaparser()
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, True)
    parser.setFeature(xml.sax.handler.feature_external_ges, False)
    parser.setContentHandler(odfs)
    content = zin.read('meta.xml')
    parser.parse(StringIO(content))
    data = odfs.seenfields
    mi = MetaInformation(None, [])
    if 'title' in data:
        mi.title = data['title']
    if data.get('initial-creator', '').strip():
        mi.authors = string_to_authors(data['initial-creator'])
    elif 'creator' in data:
        mi.authors = string_to_authors(data['creator'])
    if 'description' in data:
        mi.comments = data['description']
    if 'language' in data:
        mi.language = data['language']
    if data.get('keywords', ''):
        mi.tags = [x.strip() for x in data['keywords'].split(',') if x.strip()]
    opfmeta = False  # we need this later for the cover
    opfnocover = False
    if data.get('opf.metadata', '') == 'true':
        # custom metadata contains OPF information
        opfmeta = True
        if data.get('opf.titlesort', ''):
            mi.title_sort = data['opf.titlesort']
        if data.get('opf.authors', ''):
            mi.authors = string_to_authors(data['opf.authors'])
        if data.get('opf.authorsort', ''):
            mi.author_sort = data['opf.authorsort']
        if data.get('opf.isbn', ''):
            isbn = check_isbn(data['opf.isbn'])
            if isbn is not None:
                mi.isbn = isbn
        if data.get('opf.publisher', ''):
            mi.publisher = data['opf.publisher']
        if data.get('opf.pubdate', ''):
            mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
        if data.get('opf.series', ''):
            mi.series = data['opf.series']
            if data.get('opf.seriesindex', ''):
                try:
                    mi.series_index = float(data['opf.seriesindex'])
                except ValueError:
                    mi.series_index = 1.0
        if data.get('opf.language', ''):
            cl = canonicalize_lang(data['opf.language'])
            if cl:
                mi.languages = [cl]
        opfnocover = data.get('opf.nocover', 'false') == 'true'
    if not opfnocover:
        try:
            read_cover(stream, zin, mi, opfmeta, extract_cover)
        except:
            pass  # Do not let an error reading the cover prevent reading other data
    return mi

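# Sketch: reading metadata from an ODF file on disk (the 'book.odt' name
# is hypothetical); get_metadata() accepts any seekable binary stream.
with open('book.odt', 'rb') as f:
    mi = get_metadata(f)
print(mi.title, mi.authors)
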
def metadata_from_xmp_packet(raw_bytes):
    root = parse_xmp_packet(raw_bytes)
    mi = Metadata(_('Unknown'))
    title = first_alt('//dc:title', root)
    if title:
        mi.title = title
    authors = multiple_sequences('//dc:creator', root)
    if authors:
        mi.authors = authors
    tags = multiple_sequences('//dc:subject', root) or multiple_sequences('//pdf:Keywords', root)
    if tags:
        mi.tags = tags
    comments = first_alt('//dc:description', root)
    if comments:
        mi.comments = comments
    publishers = multiple_sequences('//dc:publisher', root)
    if publishers:
        mi.publisher = publishers[0]
    try:
        pubdate = parse_date(first_sequence('//dc:date', root) or first_simple('//xmp:CreateDate', root),
                             assume_utc=False)
    except Exception:
        pass
    else:
        mi.pubdate = pubdate
    bkp = first_simple('//xmp:CreatorTool', root)
    if bkp:
        mi.book_producer = bkp
    md = first_simple('//xmp:MetadataDate', root)
    if md:
        try:
            mi.metadata_date = parse_date(md)
        except Exception:
            pass
    rating = first_simple('//calibre:rating', root)
    if rating is not None:
        try:
            rating = float(rating)
            if 0 <= rating <= 10:
                mi.rating = rating
        except (ValueError, TypeError):
            pass
    series, series_index = read_series(root)
    if series:
        mi.series, mi.series_index = series, series_index
    for x in ('title_sort', 'author_sort'):
        for elem in XPath('//calibre:' + x)(root):
            val = read_simple_property(elem)
            if val:
                setattr(mi, x, val)
                break
    for x in ('author_link_map', 'user_categories'):
        val = first_simple('//calibre:' + x, root)
        if val:
            try:
                setattr(mi, x, json.loads(val))
            except Exception:
                pass
    languages = multiple_sequences('//dc:language', root)
    if languages:
        # materialize the filter: a lazy filter object is always truthy on Python 3
        languages = list(filter(None, map(canonicalize_lang, languages)))
        if languages:
            mi.languages = languages
    identifiers = {}
    for xmpid in XPath('//xmp:Identifier')(root):
        for scheme, value in read_xmp_identifers(xmpid):
            if scheme and value:
                identifiers[scheme.lower()] = value
    for namespace in ('prism', 'pdfx'):
        for scheme in KNOWN_ID_SCHEMES:
            if scheme not in identifiers:
                val = first_simple('//%s:%s' % (namespace, scheme), root)
                scheme = scheme.lower()
                if scheme == 'isbn':
                    val = check_isbn(val)
                elif scheme == 'doi':
                    val = check_doi(val)
                if val:
                    identifiers[scheme] = val
    # Check Dublin Core for recognizable identifier types
    for scheme, check_func in {'doi': check_doi, 'isbn': check_isbn}.items():
        if scheme not in identifiers:
            val = check_func(first_simple('//dc:identifier', root))
            if val:
                # store under the scheme that matched, not unconditionally 'doi'
                identifiers[scheme] = val
    if identifiers:
        mi.set_identifiers(identifiers)
    read_user_metadata(mi, root)
    return mi
def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        # Author names in Amazon MOBI files are usually in LN, FN format,
        # try to detect and auto-correct that.
        m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
        if m is not None:
            if tweaks['author_sort_copy_method'] != 'copy':
                self.mi.authors.append(m.group(2) + ' ' + m.group(1))
            else:
                self.mi.authors.append(m.group())
            if self.mi.is_null('author_sort'):
                self.mi.author_sort = m.group()
        else:
            self.mi.authors.append(au)
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([x.strip() for x in
                             clean_xml_chars(self.decode(content)).split(';')])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except Exception:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 109:
        self.mi.rights = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except Exception:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except Exception:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None
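# Standalone illustration (standard library only) of the "LN, FN" auto-correction
# regex used for EXTH record 100 above; 'King, Stephen' is a made-up example.
import re
m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', 'King, Stephen')
print(m.group(2) + ' ' + m.group(1))  # -> 'Stephen King' (display order)
print(m.group())                      # -> 'King, Stephen' (kept as author_sort)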
def sony_metadata(oeb):
    m = oeb.metadata
    title = short_title = str(m.title[0])
    publisher = __appname__ + ' ' + __version__
    try:
        pt = str(oeb.metadata.publication_type[0])
        short_title = ':'.join(pt.split(':')[2:])
    except Exception:
        pass

    try:
        date = parse_date(str(m.date[0]), as_utc=False).strftime('%Y-%m-%d')
    except Exception:
        date = strftime('%Y-%m-%d')
    try:
        language = str(m.language[0]).replace('_', '-')
    except Exception:
        language = 'en'
    short_title = xml(short_title, True)

    metadata = SONY_METADATA.format(title=xml(title),
            short_title=short_title,
            publisher=xml(publisher), issue_date=xml(date),
            language=xml(language))

    updated = strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime())

    def cal_id(x):
        for k, v in x.attrib.items():
            if k.endswith('scheme') and v == 'uuid':
                return True

    try:
        base_id = str(list(filter(cal_id, m.identifier))[0])
    except Exception:
        base_id = str(uuid4())

    toc = oeb.toc

    if False and toc.depth() < 3:
        # Single section periodical
        # Disabled since I prefer the current behavior
        from calibre.ebooks.oeb.base import TOC
        section = TOC(klass='section', title=_('All articles'),
                      href=oeb.spine[2].href)
        for x in toc:
            section.nodes.append(x)
        toc = TOC(klass='periodical', href=oeb.spine[2].href,
                  title=str(oeb.metadata.title[0]))
        toc.nodes.append(section)

    entries = []
    seen_titles = set()
    for i, section in enumerate(toc):
        if not section.href:
            continue
        secid = 'section%d' % i
        sectitle = section.title
        if not sectitle:
            sectitle = _('Unknown')
        d = 1
        bsectitle = sectitle
        while sectitle in seen_titles:
            sectitle = bsectitle + ' ' + str(d)
            d += 1
        seen_titles.add(sectitle)
        sectitle = xml(sectitle, True)
        secdesc = section.description
        if not secdesc:
            secdesc = ''
        secdesc = xml(secdesc)
        entries.append(SONY_ATOM_SECTION.format(title=sectitle,
            href=section.href, id=xml(base_id)+'/'+secid,
            short_title=short_title, desc=secdesc, updated=updated))

        for j, article in enumerate(section):
            if not article.href:
                continue
            atitle = article.title
            btitle = atitle
            d = 1
            while atitle in seen_titles:
                atitle = btitle + ' ' + str(d)
                d += 1
            auth = article.author if article.author else ''
            desc = section.description
            if not desc:
                desc = ''
            aid = 'article%d' % j
            entries.append(SONY_ATOM_ENTRY.format(
                title=xml(atitle),
                author=xml(auth),
                updated=updated,
                desc=desc,
                short_title=short_title,
                section_title=sectitle,
                href=article.href,
                word_count=str(1),
                id=xml(base_id)+'/'+secid+'/'+aid
            ))

    atom = SONY_ATOM.format(short_title=short_title,
            entries='\n\n'.join(entries), updated=updated,
            id=xml(base_id)).encode('utf-8')

    return metadata, atom
def get_metadata(stream, extract_cover=True):
    whitespace = re.compile(r'\s+')

    def normalize(s):
        return whitespace.sub(' ', s).strip()

    with ZipFile(stream) as zf:
        meta = zf.read('meta.xml')
        root = fromstring(meta)

        def find(field):
            ns, tag = fields[field]
            ans = root.xpath('//ns0:{}'.format(tag), namespaces={'ns0': ns})
            if ans:
                return normalize(tostring(ans[0], method='text', encoding='unicode',
                                          with_tail=False)).strip()

        mi = MetaInformation(None, [])
        title = find('title')
        if title:
            mi.title = title
        creator = find('initial-creator') or find('creator')
        if creator:
            mi.authors = string_to_authors(creator)
        desc = find('description')
        if desc:
            mi.comments = desc
        lang = find('language')
        if lang and canonicalize_lang(lang):
            mi.languages = [canonicalize_lang(lang)]
        kw = find('keyword') or find('keywords')
        if kw:
            mi.tags = [x.strip() for x in kw.split(',') if x.strip()]
        data = {}
        for tag in root.xpath('//ns0:user-defined', namespaces={'ns0': fields['user-defined'][0]}):
            name = (tag.get('{%s}name' % METANS) or '').lower()
            vtype = tag.get('{%s}value-type' % METANS) or 'string'
            val = tag.text
            if name and val:
                if vtype == 'boolean':
                    val = val == 'true'
                data[name] = val

        opfmeta = False  # we need this later for the cover
        opfnocover = False
        if data.get('opf.metadata'):
            # custom metadata contains OPF information
            opfmeta = True
            if data.get('opf.titlesort', ''):
                mi.title_sort = data['opf.titlesort']
            if data.get('opf.authors', ''):
                mi.authors = string_to_authors(data['opf.authors'])
            if data.get('opf.authorsort', ''):
                mi.author_sort = data['opf.authorsort']
            if data.get('opf.isbn', ''):
                isbn = check_isbn(data['opf.isbn'])
                if isbn is not None:
                    mi.isbn = isbn
            if data.get('opf.publisher', ''):
                mi.publisher = data['opf.publisher']
            if data.get('opf.pubdate', ''):
                mi.pubdate = parse_date(data['opf.pubdate'], assume_utc=True)
            if data.get('opf.identifiers'):
                try:
                    mi.identifiers = json.loads(data['opf.identifiers'])
                except Exception:
                    pass
            if data.get('opf.rating'):
                try:
                    mi.rating = max(0, min(float(data['opf.rating']), 10))
                except Exception:
                    pass
            if data.get('opf.series', ''):
                mi.series = data['opf.series']
                if data.get('opf.seriesindex', ''):
                    try:
                        mi.series_index = float(data['opf.seriesindex'])
                    except Exception:
                        mi.series_index = 1.0
            if data.get('opf.language', ''):
                cl = canonicalize_lang(data['opf.language'])
                if cl:
                    mi.languages = [cl]
            opfnocover = data.get('opf.nocover', False)
        if not opfnocover:
            try:
                read_cover(stream, zf, mi, opfmeta, extract_cover)
            except Exception:
                pass  # Do not let an error reading the cover prevent reading other data

    return mi
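# Usage sketch, not part of the original module: assumes the module-level
# helpers (fields, METANS, read_cover) are in scope as in calibre, and that
# 'book.odt' is a hypothetical local ODF file.
with open('book.odt', 'rb') as stream:
    mi = get_metadata(stream, extract_cover=False)
print(mi.title, mi.authors, mi.tags)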
def process_metadata(self, idx, content, codec):
    if idx == 100:
        if self.mi.is_null('authors'):
            self.mi.authors = []
        au = clean_xml_chars(self.decode(content).strip())
        self.mi.authors.append(au)
        if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
            self.mi.author_sort = au.strip()
    elif idx == 101:
        self.mi.publisher = clean_xml_chars(self.decode(content).strip())
        if self.mi.publisher in {'Unknown', _('Unknown')}:
            self.mi.publisher = None
    elif idx == 103:
        self.mi.comments = clean_xml_chars(self.decode(content).strip())
    elif idx == 104:
        raw = check_isbn(self.decode(content).strip().replace('-', ''))
        if raw:
            self.mi.isbn = raw
    elif idx == 105:
        if not self.mi.tags:
            self.mi.tags = []
        self.mi.tags.extend([x.strip() for x in
                             clean_xml_chars(self.decode(content)).split(';')])
        self.mi.tags = list(set(self.mi.tags))
    elif idx == 106:
        try:
            self.mi.pubdate = parse_date(content, as_utc=False)
        except Exception:
            pass
    elif idx == 108:
        self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
    elif idx == 112:  # dc:source set in some EBSP amazon samples
        try:
            content = content.decode(codec).strip()
            isig = 'urn:isbn:'
            if content.lower().startswith(isig):
                raw = check_isbn(content[len(isig):])
                if raw and not self.mi.isbn:
                    self.mi.isbn = raw
            elif content.startswith('calibre:'):
                # calibre book uuid is stored here by recent calibre
                # releases
                cid = content[len('calibre:'):]
                if cid:
                    self.mi.application_id = self.mi.uuid = cid
        except Exception:
            pass
    elif idx == 113:  # ASIN or other id
        try:
            self.uuid = content.decode('ascii')
            self.mi.set_identifier('mobi-asin', self.uuid)
        except Exception:
            self.uuid = None
    elif idx == 116:
        self.start_offset, = struct.unpack(b'>L', content)
    elif idx == 121:
        self.kf8_header, = struct.unpack(b'>L', content)
        if self.kf8_header == NULL_INDEX:
            self.kf8_header = None
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    XPath = partial(etree.XPath, namespaces=NAMESPACES)

    # total_results = XPath('//openSearch:totalResults')
    # start_index = XPath('//openSearch:startIndex')
    # items_per_page = XPath('//openSearch:itemsPerPage')
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    creator = XPath('descendant::dc:creator')
    identifier = XPath('descendant::dc:identifier')
    title = XPath('descendant::dc:title')
    date = XPath('descendant::dc:date')
    publisher = XPath('descendant::dc:publisher')
    subject = XPath('descendant::dc:subject')
    description = XPath('descendant::dc:description')
    language = XPath('descendant::dc:language')
    rating = XPath('descendant::gd:rating[@average]')

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    google_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # compare the extracted title string, not the XPath object (always truthy)
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'google': google_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    lang = canonicalize_lang(get_text(extra, language))
    if lang:
        mi.language = lang
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in identifier(extra):
        t = str(x.text).strip()
        if t[:5].upper() in ('ISBN:', 'LCCN:', 'OCLC:'):
            if t[:5].upper() == 'ISBN:':
                t = check_isbn(t[5:])
                if t:
                    isbns.append(t)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x.text for x in subject(extra) if x.text]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        from calibre.utils.date import parse_date, utcnow
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    for x in rating(extra):
        try:
            mi.rating = float(x.get('average'))
            if mi.rating > 5:
                mi.rating /= 2
        except Exception:
            log.exception('Failed to parse rating')

    # Cover
    mi.has_google_cover = None
    for x in extra.xpath('//*[@href and @rel="http://schemas.google.com/books/2008/thumbnail"]'):
        mi.has_google_cover = x.get('href')
        break

    return mi
def get_metadata_(src, encoding=None):
    # Meta data definitions as in
    # http://www.mobileread.com/forums/showpost.php?p=712544&postcount=9
    if isbytestring(src):
        if not encoding:
            src = xml_to_unicode(src)[0]
        else:
            src = src.decode(encoding, 'replace')
    src = src[:150000]  # Searching shouldn't take too long

    comment_tags = parse_comment_tags(src)
    meta_tags = parse_meta_tags(src)

    def get(field):
        ans = comment_tags.get(field, meta_tags.get(field, None))
        if ans:
            ans = ans.strip()
        if not ans:
            ans = None
        return ans

    # Title
    title = get('title')
    if not title:
        pat = re.compile('<title>([^<>]+?)</title>', re.IGNORECASE)
        match = pat.search(src)
        if match:
            title = replace_entities(match.group(1))

    # Author
    authors = get('authors') or _('Unknown')

    # Create MetaInformation with Title and Author
    mi = Metadata(title or _('Unknown'), string_to_authors(authors))

    for field in ('publisher', 'isbn', 'language', 'comments'):
        val = get(field)
        if val:
            setattr(mi, field, val)

    for field in ('pubdate', 'timestamp'):
        try:
            val = parse_date(get(field))
        except Exception:
            pass
        else:
            if not is_date_undefined(val):
                setattr(mi, field, val)

    # SERIES
    series = get('series')
    if series:
        pat = re.compile(r'\[([.0-9]+)\]$')
        match = pat.search(series)
        series_index = None
        if match is not None:
            try:
                series_index = float(match.group(1))
            except Exception:
                pass
            series = series.replace(match.group(), '').strip()
        mi.series = series
        if series_index is None:
            series_index = get('series_index')
            try:
                series_index = float(series_index)
            except Exception:
                pass
        if series_index is not None:
            mi.series_index = series_index

    # RATING
    rating = get('rating')
    if rating:
        try:
            mi.rating = float(rating)
            if mi.rating < 0:
                mi.rating = 0
            if mi.rating > 5:
                mi.rating /= 2.
            if mi.rating > 5:
                mi.rating = 0
        except Exception:
            pass

    # TAGS
    tags = get('tags')
    if tags:
        tags = [x.strip() for x in tags.split(',') if x.strip()]
        if tags:
            mi.tags = tags

    return mi
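# Usage sketch, not part of the original module: assumes calibre's helpers
# (parse_comment_tags, parse_meta_tags, replace_entities, ...) are in scope;
# the HTML below is a made-up example that exercises only the <title> fallback.
src = '<html><head><title>An Example Book</title></head><body>x</body></html>'
mi = get_metadata_(src)
print(mi.title)  # -> 'An Example Book'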
def _parse_my_clippings_original(self):
    '''
    Parse MyClippings.txt for entries matching installed books.
    File should end with SEPARATOR and a newline.
    '''
    SEPARATOR = '=========='
    cp = self._get_my_clippings()
    timestamp_parse_failed = False

    if cp:
        # Apparently new MyClippings.txt files are encoded UTF-8 with BOM
        with open(cp, encoding='utf-8-sig') as clippings:
            lines = clippings.readlines()

        index = 0
        line = lines[index]
        # Get to the first title (author_sort) line
        while not re.match(r'(?P<title>.*)\((?P<author_sort>.*)\)', lines[index]):
            index += 1

        while index < len(lines) - 1:
            try:
                line = lines[index]
                book_id = None

                # 1. Get the title/author_sort pair
                tas = re.match(r'(?P<title>.*)\((?P<author_sort>.*)\)', line)
                title = tas.group('title').rstrip()
                author_sort = tas.group('author_sort')

                # If title/author_sort match book in library,
                # consider this an active annotation
                if title in self.installed_books_by_title.keys():
                    book_id = self.installed_books_by_title[title]['book_id']
                index += 1

                # 2. Get [Highlight|Bookmark Location|Note]
                line = lines[index]
                ann_type = None
                if 'Highlight' in line:
                    ann_type = 'Highlight'
                elif 'Bookmark' in line:
                    ann_type = 'Bookmark'
                elif 'Note' in line:
                    ann_type = 'Note'

                # Kindle PW uses 'Location', K3 uses 'Loc.'. German uses 'Position'
                # K3 does not store location with Bookmarks. Whatever.
                loc = re.match(r'.* (?P<location>(Location|Loc\.|Position) [0-9,-]+).*', line)
                location = 'Unknown'
                location_sort = "000000"
                if loc:
                    location = loc.group('location')
                    location_sort = "%06d" % int(re.match(
                        r'^(Loc\.|Location|Position) (?P<loc>[0-9]+).*$', location).group('loc'))

                # Try to read the timestamp, fall back to local time
                try:
                    tstring = re.match(r'.*Added on (?P<timestamp>.*$)', line)
                    ts = tstring.group('timestamp')
                    isoformat = parse_date(ts, as_utc=False)
                    timestamp = mktime(isoformat.timetuple())
                except Exception:
                    if not timestamp_parse_failed:
                        self._log(" Unable to parse entries from 'My Clippings.txt'")
                        self._log(" %s driver supports English only." % self.app_name)
                        timestamp_parse_failed = True
                    timestamp = mktime(localtime())
                while timestamp in self.active_annotations:
                    timestamp += 1
                index += 1

                # 3. blank line(s)
                while lines[index].strip() == '':
                    index += 1

                # 4. highlight or note
                item = lines[index]
                highlight_text = None
                note_text = None
                if ann_type == 'Highlight':
                    highlight_text = [str(item)]
                    index += 1
                    while lines[index].strip() != SEPARATOR:
                        highlight_text.append(str(lines[index]))
                        index += 1
                elif ann_type == 'Note':
                    note_text = [str(item)]
                    index += 1
                    while lines[index].strip() != SEPARATOR:
                        note_text.append(str(lines[index]))
                        index += 1

                # Pass SEPARATOR
                index += 1

                # 5. Store the active_annotation
                if book_id:
                    # Notes and highlights are created simultaneously
                    if timestamp not in self.active_annotations:
                        self.active_annotations[timestamp] = {
                            'annotation_id': timestamp,
                            'book_id': book_id,
                            'highlight_color': 'Gray',
                            'location': location,
                            'location_sort': location_sort
                        }
                    if highlight_text is not None:
                        self.active_annotations[timestamp]['highlight_text'] = highlight_text
                    if note_text is not None:
                        self.active_annotations[timestamp]['note_text'] = note_text
            except Exception:
                # Unexpected EOF. Return with whatever we have
                self._log_location("failed with line: %s" % repr(line))
                import traceback
                traceback.print_exc()
                return
def adapt_datetime(x, d):
    if isinstance(x, (unicode_type, bytes)):
        x = parse_date(x, assume_utc=False, as_utc=False)
    return x
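# Usage sketch (assumes calibre's parse_date and the polyglot alias
# unicode_type, i.e. str, are in scope): string timestamps are normalized to
# datetime objects, everything else passes through unchanged.
from datetime import datetime
print(adapt_datetime('2023-07-01 12:00:00', None))  # parsed to a datetime
print(adapt_datetime(datetime(2023, 7, 1), None))   # returned as-is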
def get_book_detail(self, br, metadata_url, mi, ovrdrv_id, log):
    from html5_parser import parse
    from lxml import html
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.library.comments import sanitize_comments_html

    try:
        raw = br.open_novisit(metadata_url).read()
    except Exception as e:
        if callable(getattr(e, 'getcode', None)) and e.getcode() == 404:
            return False
        raise
    raw = xml_to_unicode(raw, strip_encoding_pats=True, resolve_entities=True)[0]

    try:
        root = parse(raw, maybe_xhtml=False, sanitize_names=True)
    except Exception:
        return False

    pub_date = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblPubDate']/text()")
    lang = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblLanguage']/text()")
    subjects = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblSubjects']/text()")
    ebook_isbn = root.xpath("//td/label[@id='ctl00_ContentPlaceHolder1_lblIdentifier']/text()")
    desc = root.xpath("//div/label[@id='ctl00_ContentPlaceHolder1_lblDescription']/ancestor::div[1]")

    if pub_date:
        from calibre.utils.date import parse_date
        try:
            mi.pubdate = parse_date(pub_date[0].strip())
        except Exception:
            pass
    if lang:
        lang = lang[0].strip().lower()
        lang = {'english': 'eng', 'french': 'fra', 'german': 'deu',
                'spanish': 'spa'}.get(lang, None)
        if lang:
            mi.language = lang
    if ebook_isbn:
        # print("ebook isbn is "+type('')(ebook_isbn[0]))
        isbn = check_isbn(ebook_isbn[0].strip())
        if isbn:
            self.cache_isbn_to_identifier(isbn, ovrdrv_id)
            mi.isbn = isbn
    if subjects:
        mi.tags = [tag.strip() for tag in subjects[0].split(',')]

    if desc:
        desc = desc[0]
        desc = html.tostring(desc, method='html', encoding='unicode').strip()
        # remove all attributes from tags
        desc = re.sub(r'<([a-zA-Z0-9]+)\s[^>]+>', r'<\1>', desc)
        # Remove comments
        desc = re.sub(r'(?s)<!--.*?-->', '', desc)
        mi.comments = sanitize_comments_html(desc)

    return None
def itervals(self, record):
    for name, fm in self.entries:
        dt = fm['datatype']
        val = record[fm['rec_index']]

        if dt == 'composite':
            sb = fm['display'].get('composite_sort', 'text')
            if sb == 'date':
                try:
                    val = parse_date(val)
                except Exception:
                    val = UNDEFINED_DATE
                dt = 'datetime'
            elif sb == 'number':
                try:
                    p = 1
                    for i, candidate in enumerate(('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')):
                        if val.endswith(candidate):
                            p = 1024**i
                            val = val[:-len(candidate)].strip()
                            break
                    val = locale.atof(val) * p
                except Exception:
                    val = 0.0
                dt = 'float'
            elif sb == 'bool':
                val = force_to_bool(val)
                dt = 'bool'

        if dt == 'datetime':
            if val is None:
                val = UNDEFINED_DATE
            if tweaks['sort_dates_using_visible_fields']:
                format = None
                if name == 'timestamp':
                    format = tweaks['gui_timestamp_display_format']
                elif name == 'pubdate':
                    format = tweaks['gui_pubdate_display_format']
                elif name == 'last_modified':
                    format = tweaks['gui_last_modified_display_format']
                elif fm['is_custom']:
                    format = fm['display'].get('date_format', None)
                val = clean_date_for_sort(val, format)
        elif dt == 'series':
            if val is None:
                val = ('', 1)
            else:
                if self.library_order:
                    try:
                        lang = record[self.lang_idx].partition(',')[0]
                    except (AttributeError, ValueError, KeyError, IndexError, TypeError):
                        lang = None
                    val = title_sort(val, order='library_order', lang=lang)
                sidx_fm = self.field_metadata[name + '_index']
                sidx = record[sidx_fm['rec_index']]
                val = (self.string_sort_key(val), sidx)
        elif dt in ('text', 'comments', 'composite', 'enumeration'):
            if val:
                if fm['is_multiple']:
                    jv = fm['is_multiple']['list_to_ui']
                    sv = fm['is_multiple']['cache_to_list']
                    if '&' in jv:
                        val = jv.join([author_to_author_sort(v) for v in val.split(sv)])
                    else:
                        val = jv.join(sorted(val.split(sv), key=self.string_sort_key))
                val = self.string_sort_key(val)
        elif dt == 'bool':
            if not self.db_prefs.get('bools_are_tristate'):
                val = {True: 1, False: 2, None: 2}.get(val, 2)
            else:
                val = {True: 1, False: 2, None: 3}.get(val, 3)

        yield val
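# Standalone illustration (standard library only) of the size-suffix
# normalization used by the 'number' composite sort above.
import locale
locale.setlocale(locale.LC_NUMERIC, 'C')  # pin decimal-point parsing
val, p = '1.5 MB', 1
for i, candidate in enumerate(('B', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB')):
    if val.endswith(candidate):
        p = 1024**i
        val = val[:-len(candidate)].strip()
        break
print(locale.atof(val) * p)  # -> 1572864.0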
def to_metadata(self, browser, log, entry_, timeout):  # {{{
    from calibre.utils.date import parse_date, utcnow

    douban_id = entry_.get("id")
    title = entry_.get("title")
    description = entry_.get("summary")
    # subtitle = entry_.get('subtitle')  # TODO: std metadata doesn't have this field
    publisher = entry_.get("publisher")
    isbn = entry_.get("isbn13")  # ISBN11 is obsolete, use ISBN13
    pubdate = entry_.get("pubdate")
    authors = entry_.get("author")
    book_tags = entry_.get("tags")
    rating = entry_.get("rating")
    cover_url = entry_.get("images", {}).get("large")
    series = entry_.get("series")

    if not authors:
        authors = [_("Unknown")]
    if not douban_id or not title:
        # Silently discard this entry
        return None

    mi = Metadata(title, authors)
    mi.identifiers = {"douban": douban_id}
    mi.publisher = publisher
    mi.comments = description
    # mi.subtitle = subtitle

    # ISBN
    isbns = []
    if isinstance(isbn, (str, bytes)):
        if check_isbn(isbn):
            isbns.append(isbn)
    else:
        for x in isbn:
            if check_isbn(x):
                isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    mi.tags = [tag["name"] for tag in book_tags]

    # pubdate
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error("Failed to parse pubdate %r" % pubdate)

    # Ratings
    if rating:
        try:
            mi.rating = float(rating["average"]) / 2.0
        except Exception:
            log.exception("Failed to parse rating")
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url
    if u:
        # If URL contains "book-default", the book doesn't have a cover
        if u.find("book-default") == -1:
            mi.has_douban_cover = u

    # Series
    if series:
        mi.series = series["title"]

    return mi
def to_metadata(browser, log, entry_, timeout):  # {{{
    from lxml import etree
    from calibre.ebooks.chardet import xml_to_unicode
    from calibre.utils.date import parse_date, utcnow
    from calibre.utils.cleantext import clean_ascii_chars

    XPath = partial(etree.XPath, namespaces=NAMESPACES)
    entry = XPath('//atom:entry')
    entry_id = XPath('descendant::atom:id')
    title = XPath('descendant::atom:title')
    description = XPath('descendant::atom:summary')
    publisher = XPath("descendant::db:attribute[@name='publisher']")
    isbn = XPath("descendant::db:attribute[@name='isbn13']")
    date = XPath("descendant::db:attribute[@name='pubdate']")
    creator = XPath("descendant::db:attribute[@name='author']")
    booktag = XPath("descendant::db:tag/attribute::name")
    rating = XPath("descendant::gd:rating/attribute::average")
    cover_url = XPath("descendant::atom:link[@rel='image']/attribute::href")

    def get_text(extra, x):
        try:
            ans = x(extra)
            if ans:
                ans = ans[0].text
                if ans and ans.strip():
                    return ans.strip()
        except Exception:
            log.exception('Programming error:')
        return None

    id_url = entry_id(entry_)[0].text
    douban_id = id_url.split('/')[-1]
    title_ = ': '.join([x.text for x in title(entry_)]).strip()
    authors = [x.text.strip() for x in creator(entry_) if x.text]
    if not authors:
        authors = [_('Unknown')]
    # compare the extracted title string, not the XPath object (always truthy)
    if not id_url or not title_:
        # Silently discard this entry
        return None

    mi = Metadata(title_, authors)
    mi.identifiers = {'douban': douban_id}
    try:
        raw = get_details(browser, id_url, timeout)
        feed = etree.fromstring(
            xml_to_unicode(clean_ascii_chars(raw), strip_encoding_pats=True)[0])
        extra = entry(feed)[0]
    except Exception:
        log.exception('Failed to get additional details for', mi.title)
        return mi

    mi.comments = get_text(extra, description)
    mi.publisher = get_text(extra, publisher)

    # ISBN
    isbns = []
    for x in [t.text for t in isbn(extra)]:
        if check_isbn(x):
            isbns.append(x)
    if isbns:
        mi.isbn = sorted(isbns, key=len)[-1]
    mi.all_isbns = isbns

    # Tags
    try:
        btags = [x for x in booktag(extra) if x]
        tags = []
        for t in btags:
            atags = [y.strip() for y in t.split('/')]
            for tag in atags:
                if tag not in tags:
                    tags.append(tag)
    except Exception:
        log.exception('Failed to parse tags:')
        tags = []
    if tags:
        mi.tags = [x.replace(',', ';') for x in tags]

    # pubdate
    pubdate = get_text(extra, date)
    if pubdate:
        try:
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    # Ratings
    if rating(extra):
        try:
            mi.rating = float(rating(extra)[0]) / 2.0
        except Exception:
            log.exception('Failed to parse rating')
            mi.rating = 0

    # Cover
    mi.has_douban_cover = None
    u = cover_url(extra)
    if u:
        u = u[0].replace('/spic/', '/lpic/')
        # If URL contains "book-default", the book doesn't have a cover
        if u.find('book-default') == -1:
            mi.has_douban_cover = u

    return mi
def retrieve_bokelai_detail(self, bokelai_id, log, result_queue, timeout):
    detail_url = self.BOKELAI_DETAIL_URL % bokelai_id
    log.info(detail_url)

    try:
        br = self.browser
        _raw = br.open_novisit(detail_url, timeout=timeout)
        raw = _raw.read()
    except Exception:
        log.exception('Failed to load detail page: %s' % detail_url)
        return

    root = etree.HTML(raw)
    info_json_text = root.xpath("//script[@type='application/ld+json']")[0].text
    log.info(info_json_text)
    info_json = json.loads(info_json_text)

    title = info_json['name']
    authors = info_json['author'][0]['name'].split(",")
    publisher = info_json['publisher'][0]['name']
    isbn = info_json['workExample']['workExample']['isbn']
    pubdate = info_json['datePublished']

    comments_ele = root.xpath("(//div[@class='content'])[1]//text()")
    comments = "\n".join(comments_ele)

    tags = list()
    for ele in root.xpath("//li[contains(text(),'本書分類:')]/a"):
        log.info(ele.text)
        # split slash-separated category paths into individual tags
        # (the original duplicated this branch, extending tags twice)
        if "/" in ele.text:
            tags.extend(ele.text.split("/"))
        else:
            tags.append(ele.text)

    cover_url = re.search(r'https[^\?\=\&]*' + bokelai_id + r'[^\?\=\&]*',
                          info_json['image']).group(0)

    if not authors:
        authors = [_('Unknown')]

    log.info(title, authors, publisher, isbn, pubdate, comments, tags, cover_url)

    mi = Metadata(title, authors)
    mi.identifiers = {'bokelai': bokelai_id, 'isbn': isbn}
    mi.publisher = publisher
    mi.comments = comments
    mi.isbn = isbn
    mi.tags = tags

    if pubdate:
        try:
            from calibre.utils.date import parse_date, utcnow
            default = utcnow().replace(day=15)
            mi.pubdate = parse_date(pubdate, assume_utc=True, default=default)
        except Exception:
            log.error('Failed to parse pubdate %r' % pubdate)

    if cover_url is not None:
        mi.has_bokelai_cover = cover_url
        self.cache_identifier_to_cover_url(mi.identifiers['bokelai'], mi.has_bokelai_cover)
    else:
        mi.has_bokelai_cover = None

    result_queue.put(mi)