Python MetaInformation.set_identifier Examples

Programming Language: Python

Namespace/Package Name: calibre.ebooks.metadata

Class/Type: MetaInformation

Method/Function: set_identifier

Examples at hotexamples.com: 5

Python MetaInformation.set_identifier - 5 examples found. These are the top rated real world Python examples of calibre.ebooks.metadata.MetaInformation.set_identifier extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

MetaInformation(30)

tags(27)

title(23)

authors(20)

publisher(18)

series(17)

comments(16)

cover_data(13)

isbn(12)

pubdate(10)

languages(9)

series_index(7)

language(6)

rating(6)

author_sort(6)

title_sort(5)

book_producer(4)

set_identifiers(4)

author(4)

get_identifiers(3)

timestamp(3)

is_null(3)

smart_update(2)

identifiers(2)

set_identifier(2)

category(2)

cover(2)

rights(1)

application_id(1)

to_html(1)

Example #1

Show file

class EXTHHeader(object): # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503: # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = content.decode(codec)
                except:
                    pass
            elif idx == 524: # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            #else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(title)

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = content.decode(codec, 'ignore').strip()
            self.mi.authors.append(au)
            if self.mi.is_null('author_sort') and re.match(r'\S+?\s*,\s+\S+', au.strip()):
                self.mi.author_sort = au.strip()
        elif idx == 101:
            self.mi.publisher = content.decode(codec, 'ignore').strip()
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = content.decode(codec, 'ignore')
        elif idx == 104:
            raw = check_isbn(content.decode(codec, 'ignore').strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in content.decode(codec,
                'ignore').split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = content.decode(codec, 'ignore').strip()
        elif idx == 112: # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
            except:
                pass
        elif idx == 113: # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None

Example #2

Show file

File: headers.py Project: JimmXinu/calibre

class EXTHHeader(object):  # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon  MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None

Example #3

Show file

def get_metadata(stream, cover=True):
    with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
        try:
            res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                    (pdfpath, bool(cover)))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = res['result']
        with open(res['stdout_stderr'], 'rb') as f:
            raw = f.read().strip()
            if raw:
                prints(raw)
        if info is None:
            raise ValueError('Could not read info dict from PDF')
        covpath = os.path.join(pdfpath, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, 'rb') as f:
                cdata = f.read()

    title = info.get('Title', None) or _('Unknown')
    au = info.get('Author', None)
    if au is None:
        au = [_('Unknown')]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)
    # if isbn is not None:
    #    mi.isbn = isbn

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',')]
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    for scheme, check_func in iteritems({'doi':check_doi, 'isbn':check_isbn}):
        if scheme not in mi.get_identifiers():
            for k, v in iteritems(info):
                if k != 'xmp_metadata':
                    val = check_func(v)
                    if val:
                        mi.set_identifier(scheme, val)
                        break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi

Example #4

Show file

File: headers.py Project: T30rGRB7/calibre-python3

class EXTHHeader(object):  # {{{

    def __init__(self, raw, codec, title):
        self.doctype = raw[:4]
        self.length, self.num_items = struct.unpack('>LL', raw[4:12])
        raw = raw[12:]
        pos = 0
        self.mi = MetaInformation(_('Unknown'), [_('Unknown')])
        self.has_fake_cover = True
        self.start_offset = None
        left = self.num_items
        self.kf8_header = None
        self.uuid = self.cdetype = None
        self.page_progression_direction = None
        self.primary_writing_mode = None

        self.decode = lambda x : clean_ascii_chars(x.decode(codec, 'replace'))

        while left > 0:
            left -= 1
            idx, size = struct.unpack('>LL', raw[pos:pos + 8])
            content = raw[pos + 8:pos + size]
            pos += size
            if idx >= 100 and idx < 200:
                self.process_metadata(idx, content, codec)
            elif idx == 203:
                self.has_fake_cover = bool(struct.unpack('>L', content)[0])
            elif idx == 201:
                co, = struct.unpack('>L', content)
                if co < NULL_INDEX:
                    self.cover_offset = co
            elif idx == 202:
                self.thumbnail_offset, = struct.unpack('>L', content)
            elif idx == 501:
                try:
                    self.cdetype = content.decode('ascii')
                except UnicodeDecodeError:
                    self.cdetype = None
                # cdetype
                if content == b'EBSP':
                    if not self.mi.tags:
                        self.mi.tags = []
                    self.mi.tags.append(_('Sample Book'))
            elif idx == 502:
                # last update time
                pass
            elif idx == 503:  # Long title
                # Amazon seems to regard this as the definitive book title
                # rather than the title from the PDB header. In fact when
                # sending MOBI files through Amazon's email service if the
                # title contains non ASCII chars or non filename safe chars
                # they are messed up in the PDB header
                try:
                    title = self.decode(content)
                except:
                    pass
            elif idx == 524:  # Lang code
                try:
                    lang = content.decode(codec)
                    lang = canonicalize_lang(lang)
                    if lang:
                        self.mi.language = lang
                except:
                    pass
            elif idx == 525:
                try:
                    pwm = content.decode(codec)
                    if pwm:
                        self.primary_writing_mode = pwm
                except Exception:
                    pass
            elif idx == 527:
                try:
                    ppd = content.decode(codec)
                    if ppd:
                        self.page_progression_direction = ppd
                except Exception:
                    pass
            # else:
            #    print 'unknown record', idx, repr(content)
        if title:
            self.mi.title = replace_entities(clean_xml_chars(clean_ascii_chars(title)))

    def process_metadata(self, idx, content, codec):
        if idx == 100:
            if self.mi.is_null('authors'):
                self.mi.authors = []
            au = clean_xml_chars(self.decode(content).strip())
            # Author names in Amazon  MOBI files are usually in LN, FN format,
            # try to detect and auto-correct that.
            m = re.match(r'([^,]+?)\s*,\s+([^,]+)$', au.strip())
            if m is not None:
                if tweaks['author_sort_copy_method'] != 'copy':
                    self.mi.authors.append(m.group(2) + ' ' + m.group(1))
                else:
                    self.mi.authors.append(m.group())
                if self.mi.is_null('author_sort'):
                    self.mi.author_sort = m.group()
            else:
                self.mi.authors.append(au)
        elif idx == 101:
            self.mi.publisher = clean_xml_chars(self.decode(content).strip())
            if self.mi.publisher in {'Unknown', _('Unknown')}:
                self.mi.publisher = None
        elif idx == 103:
            self.mi.comments  = clean_xml_chars(self.decode(content).strip())
        elif idx == 104:
            raw = check_isbn(self.decode(content).strip().replace('-', ''))
            if raw:
                self.mi.isbn = raw
        elif idx == 105:
            if not self.mi.tags:
                self.mi.tags = []
            self.mi.tags.extend([x.strip() for x in clean_xml_chars(self.decode(content)).split(';')])
            self.mi.tags = list(set(self.mi.tags))
        elif idx == 106:
            try:
                self.mi.pubdate = parse_date(content, as_utc=False)
            except:
                pass
        elif idx == 108:
            self.mi.book_producer = clean_xml_chars(self.decode(content).strip())
        elif idx == 109:
            self.mi.rights = clean_xml_chars(self.decode(content).strip())
        elif idx == 112:  # dc:source set in some EBSP amazon samples
            try:
                content = content.decode(codec).strip()
                isig = 'urn:isbn:'
                if content.lower().startswith(isig):
                    raw = check_isbn(content[len(isig):])
                    if raw and not self.mi.isbn:
                        self.mi.isbn = raw
                elif content.startswith('calibre:'):
                    # calibre book uuid is stored here by recent calibre
                    # releases
                    cid = content[len('calibre:'):]
                    if cid:
                        self.mi.application_id = self.mi.uuid = cid
            except:
                pass
        elif idx == 113:  # ASIN or other id
            try:
                self.uuid = content.decode('ascii')
                self.mi.set_identifier('mobi-asin', self.uuid)
            except:
                self.uuid = None
        elif idx == 116:
            self.start_offset, = struct.unpack(b'>L', content)
        elif idx == 121:
            self.kf8_header, = struct.unpack(b'>L', content)
            if self.kf8_header == NULL_INDEX:
                self.kf8_header = None

Example #5

Show file

File: pdf.py Project: Aliminator666/calibre

def get_metadata(stream, cover=True):
    with TemporaryDirectory('_pdf_metadata_read') as pdfpath:
        stream.seek(0)
        with open(os.path.join(pdfpath, 'src.pdf'), 'wb') as f:
            shutil.copyfileobj(stream, f)
        try:
            res = fork_job('calibre.ebooks.metadata.pdf', 'read_info',
                    (pdfpath, bool(cover)))
        except WorkerError as e:
            prints(e.orig_tb)
            raise RuntimeError('Failed to run pdfinfo')
        info = res['result']
        with open(res['stdout_stderr'], 'rb') as f:
            raw = f.read().strip()
            if raw:
                prints(raw)
        if not info:
            raise ValueError('Could not read info dict from PDF')
        covpath = os.path.join(pdfpath, 'cover.jpg')
        cdata = None
        if cover and os.path.exists(covpath):
            with open(covpath, 'rb') as f:
                cdata = f.read()

    title = info.get('Title', None)
    au = info.get('Author', None)
    if au is None:
        au = [_('Unknown')]
    else:
        au = string_to_authors(au)
    mi = MetaInformation(title, au)
    # if isbn is not None:
    #    mi.isbn = isbn

    creator = info.get('Creator', None)
    if creator:
        mi.book_producer = creator

    keywords = info.get('Keywords', None)
    mi.tags = []
    if keywords:
        mi.tags = [x.strip() for x in keywords.split(',')]
        isbn = [check_isbn(x) for x in mi.tags if check_isbn(x)]
        if isbn:
            mi.isbn = isbn = isbn[0]
        mi.tags = [x for x in mi.tags if check_isbn(x) != isbn]

    subject = info.get('Subject', None)
    if subject:
        mi.tags.insert(0, subject)

    if 'xmp_metadata' in info:
        from calibre.ebooks.metadata.xmp import consolidate_metadata
        mi = consolidate_metadata(mi, info)

    # Look for recognizable identifiers in the info dict, if they were not
    # found in the XMP metadata
    for scheme, check_func in {'doi':check_doi, 'isbn':check_isbn}.iteritems():
        if scheme not in mi.get_identifiers():
            for k, v in info.iteritems():
                if k != 'xmp_metadata':
                    val = check_func(v)
                    if val:
                        mi.set_identifier(scheme, val)
                        break

    if cdata:
        mi.cover_data = ('jpeg', cdata)
    return mi