def process_mobi_metadata(metadata):
    """Build an ``EbookMetadata`` object from parsed MOBI metadata.

    ``metadata`` is read both as a container (``'mobi' in metadata``) and via
    attributes (``metadata.filetype``, ``metadata.mobi``, ``metadata.pdb``).
    When it has no ``mobi`` entry, an ``EbookMetadata`` created from the file
    type alone is returned.

    The title comes from ``mobi.fullname`` when available, otherwise from a
    sanitised ``pdb.name``; an EXTH record of type 503 overrides either.
    Other EXTH records fill in authors, publisher, description, ISBN, tags,
    publication date and rights.

    Returns the populated ``EbookMetadata`` instance.
    """
    ebook = EbookMetadata(metadata.filetype)
    if 'mobi' not in metadata:
        return ebook

    mobi = metadata.mobi

    # Determine the codec used for strings. Defaults to 'cp1252'.
    codec = 'cp1252'
    if 'text_encoding' in mobi:
        try:
            codec = {1252: 'cp1252', 65001: 'utf-8'}[mobi.text_encoding]
        except KeyError:
            # Call form with a single argument prints the same text on
            # Python 2 and Python 3.
            print("Unknown codepage %d. Assuming '%s'" % (mobi.text_encoding, codec))

    try:
        ebook.title = mobi.fullname.decode(codec, 'replace')
    except AttributeError:
        # No usable full name: fall back to the PDB name with NUL bytes
        # removed and runs of other unexpected characters replaced by '_'.
        ebook.title = re.sub('[^-A-Za-z0-9\"";:., ]+', '_',
                             metadata.pdb.name.replace('\x00', ''))

    ebook.setdefault('languages', []).append(mobi2iana_language(mobi.locale))

    def decoded(record):
        # EXTH payloads are decoded leniently; undecodable bytes are dropped.
        return record.data.decode(codec, 'ignore')

    if 'exth' in mobi:
        for record in mobi.exth.records:
            kind = record.type
            if kind == 100:
                from biblio.ebook import parse_ebook_authors
                authors = parse_ebook_authors(decoded(record).strip())
                ebook.setdefault('authors', []).extend(authors)
            elif kind == 101:
                ebook.publisher = decoded(record).strip()
            elif kind == 103:
                ebook.description = decoded(record)
            elif kind == 104:
                isbn = decoded(record).strip().replace('-', '')
                ebook.setdefault('identifiers', {})['isbn'] = isbn
            elif kind == 105:
                tags = [t.strip() for t in decoded(record).split(';')]
                ebook.setdefault('tags', []).extend(tags)
                # Drop duplicates but keep first-seen order so the result is
                # deterministic (a plain set() round-trip is not).
                seen = set()
                unique_tags = []
                for tag in ebook.tags:
                    if tag not in seen:
                        seen.add(tag)
                        unique_tags.append(tag)
                ebook.tags = unique_tags
            elif kind == 106:
                from biblio.ebook import parse_ebook_date
                ebook.date_published = parse_ebook_date(decoded(record))
            elif kind == 108:
                pass  # contributor
            elif kind == 109:
                ebook.rights = decoded(record)
            elif kind == 503:
                ebook.title = decoded(record)

    if ebook.title:
        ebook.title = replace_entities(ebook.title, codec)

    return ebook
def process_opf_metadata(metadata, ebook):
    """Copy Dublin Core / OPF metadata entries onto ``ebook``.

    ``metadata.metadata`` is iterated as ``(tag, attribs, text)`` triples,
    where ``tag`` is a namespace-qualified element name and ``attribs`` is a
    mapping of attribute names to values. Recognised Dublin Core elements set
    the title, publisher, publication date, description, rights, languages,
    tags, identifiers and authors; ``meta`` elements named ``calibre:series``
    and ``calibre:series_index`` set the series and its index.

    ``text`` may be ``None`` (the subject and identifier branches already
    allowed for this); it is treated as empty text everywhere it is stripped,
    so an empty element no longer raises ``AttributeError``.

    ``ebook`` is modified in place and also returned.
    """
    dc = '{http://purl.org/dc/elements/1.1/}'
    opf = '{http://www.idpf.org/2007/opf}'

    for tag, attribs, text in metadata.metadata:
        # Normalise once: None (empty element) behaves like empty text.
        stripped = (text or '').strip()

        if tag == dc + 'title':
            ebook.title = stripped
        elif tag == dc + 'publisher':
            ebook.publisher = stripped
        elif tag == dc + 'date':
            from biblio.ebook import parse_ebook_date
            ebook.date_published = parse_ebook_date(stripped)
        elif tag == dc + 'description':
            ebook.description = stripped
        elif tag == dc + 'rights':
            ebook.rights = stripped
        elif tag == dc + 'language':
            ebook.setdefault('languages', []).append(stripped)
        elif tag == dc + 'subject':
            if stripped:
                # A single subject element may hold a comma-separated list.
                ebook.setdefault('tags', []).extend(
                    [x.strip() for x in text.split(',')])
        elif tag == dc + 'identifier':
            if stripped:
                # The identifier type is taken from any attribute whose name
                # ends in 'scheme' (namespaced or not). items() works on both
                # Python 2 and Python 3, unlike iteritems().
                for attr, val in attribs.items():
                    if attr.endswith('scheme'):
                        ebook.setdefault('identifiers', {})[val.lower()] = stripped
        elif tag == dc + 'creator':
            # Accept creators explicitly marked as author ('aut') under either
            # the plain or the OPF-namespaced role attribute, and creators
            # that carry no role attribute at all.
            plain_role = 'role' in attribs
            opf_role = (opf + 'role') in attribs
            if ((plain_role and attribs['role'] == 'aut') or
                    (opf_role and attribs[opf + 'role'] == 'aut') or
                    (not plain_role and not opf_role)):
                from biblio.ebook import parse_ebook_authors
                ebook.setdefault('authors', []).extend(parse_ebook_authors(text))
        elif tag == opf + 'meta':
            if not ('name' in attribs and 'content' in attribs):
                continue
            name = attribs['name']
            content = attribs['content']
            if name == 'calibre:series':
                ebook.series = content.strip()
            elif name == 'calibre:series_index':
                ebook.series_index = float(content.strip())

    return ebook