Example #1
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:
                    def swap(a):
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)
                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except:
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #2
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    if isbytestring(name):
        name = name.decode(filesystem_encoding, 'replace')
    name = name.rpartition('.')[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get('filename_pattern'))
    name = name.replace('_', ' ')
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        try:
            mi.title = match.group('title')
        except IndexError:
            pass
        try:
            au = match.group('author')
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs['swap_author_names'] and mi.authors:

                    def swap(a):
                        if ',' in a:
                            parts = a.split(',', 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return ' '.join(parts)

                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group('series')
        except IndexError:
            pass
        try:
            si = match.group('series_index')
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group('isbn')
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group('publisher')
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group('published')
            if pubdate:
                from calibre.utils.date import parse_only_date
                mi.pubdate = parse_only_date(pubdate)
        except:
            pass
        try:
            comments = match.group('comments')
            mi.comments = comments
        except (IndexError, ValueError):
            pass

    if mi.is_null('title'):
        mi.title = name
    return mi
Example #3
0
class EXTHHeader(object):  # {{{
    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x: clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(
                clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon  MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([
                x.strip()
                for x in clean_xml_chars(self.decode(content)).split(';')
            ])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(
                self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
Example #4
0
class EXTHHeader(object): # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503: # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except:
                    pass
            elif idx == 524: # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            #else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif idx == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = content.decode(codec, 'ignore')
        elif idx == 104:
            raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in content.decode(codec,
                'ignore').split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = content.decode(codec, 'ignore').strip()
        elif idx == 112: # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
            except:
                pass
        elif idx == 113: # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
Example #5
0
class EXTHHeader(object):  # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon  MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
Example #6
0
class EXTHHeader(object): # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503: # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except:
                    pass
            #else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            if re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif idx == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = content.decode(codec, 'ignore')
        elif idx == 104:
            raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in content.decode(codec,
                'ignore').split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = content.decode(codec, 'ignore').strip()
        elif idx == 112: # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
            except:
                pass
        elif idx == 113: # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None
Example #7
0
def metadata_from_filename(name, pat=None, fallback_pat=None):
    if isbytestring(name):
        name = name.decode(filesystem_encoding, "replace")
    name = name.rpartition(".")[0]
    mi = MetaInformation(None, None)
    if pat is None:
        pat = re.compile(prefs.get("filename_pattern"))
    name = name.replace("_", " ")
    match = pat.search(name)
    if match is None and fallback_pat is not None:
        match = fallback_pat.search(name)
    if match is not None:
        try:
            mi.title = match.group("title")
        except IndexError:
            pass
        try:
            au = match.group("author")
            aus = string_to_authors(au)
            if aus:
                mi.authors = aus
                if prefs["swap_author_names"] and mi.authors:

                    def swap(a):
                        if "," in a:
                            parts = a.split(",", 1)
                        else:
                            parts = a.split(None, 1)
                        if len(parts) > 1:
                            t = parts[-1]
                            parts = parts[:-1]
                            parts.insert(0, t)
                        return " ".join(parts)

                    mi.authors = [swap(x) for x in mi.authors]
        except (IndexError, ValueError):
            pass
        try:
            mi.series = match.group("series")
        except IndexError:
            pass
        try:
            si = match.group("series_index")
            mi.series_index = float(si)
        except (IndexError, ValueError, TypeError):
            pass
        try:
            si = match.group("isbn")
            mi.isbn = si
        except (IndexError, ValueError):
            pass
        try:
            publisher = match.group("publisher")
            mi.publisher = publisher
        except (IndexError, ValueError):
            pass
        try:
            pubdate = match.group("published")
            if pubdate:
                from calibre.utils.date import parse_only_date

                mi.pubdate = parse_only_date(pubdate)
        except:
            pass

    if mi.is_null("title"):
        mi.title = name
    return mi