def get_book_info(fname):
    """Return whatever basic metadata can be read from an ebook file.

    Only mobi, epub and opf files are handled; the extension is taken as
    the text after the last '.' in fname and compared case-insensitively.

    The result is a dict that may contain the keys 'type', 'title',
    'creator', 'language' and 'identifier'.  An empty dict is returned when
    fname has no '.', when the extension is not supported, or when a mobi
    file cannot be parsed.  Errors raised while opening or parsing an
    epub/opf file are not caught here.
    """
    # pdf is deliberately not handled:
    # none of the pdfs in my library had language,isbn
    # most didn't have author, or had the wrong author
    # (author set to publisher, or software used)
    # so probably not much point in looking at pdfs
    res = {}
    if '.' not in fname:
        return res
    extn = fname.rsplit('.', 1)[1].lower()

    if extn == "mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi just yields no metadata
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        res['type'] = "mobi"
        return res

    if extn == "epub":
        # the archive is closed again as soon as the metafile has been read
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            container = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(container):
                return res
            cfname = ""
            for rootfile in container[0]:
                if 'full-path' in rootfile.attrib:
                    # the first rootfile entry is the one we want
                    cfname = rootfile.attrib['full-path']
                    break
            if not cfname:
                # container does not point at a contents metafile
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
        res['type'] = "epub"
    elif extn == "opf":
        # read as bytes so the parser honours the xml encoding declaration
        with open(fname, 'rb') as opf:
            txt = opf.read()
        res['type'] = "opf"
    else:
        # unsupported extension: same empty result as "no extension"
        return res

    # repackage the data
    tree = ElementTree.fromstring(txt)
    if not len(tree):
        return res

    # the first child of the root element is expected to hold the metadata
    for elem in tree[0]:
        tag = str(elem.tag)
        if '}' in tag:
            # strip the "{namespace}" prefix ElementTree puts on the tag
            tag = tag.split('}')[1]
        tag = tag.lower()
        text = elem.text
        attrib = str(elem.attrib).lower()
        if 'title' in tag:
            res['title'] = text
        elif 'language' in tag:
            res['language'] = text
        elif 'creator' in tag:
            res['creator'] = text
        elif 'identifier' in tag and 'isbn' in attrib:
            if formatter.is_valid_isbn(text):
                res['identifier'] = text
    return res
def get_book_info(fname):
    """Return whatever basic metadata can be read from an ebook file.

    Only .mobi, .azw3, .epub and .opf files are handled; the extension is
    compared case-insensitively.

    The result is a dict that may contain the keys 'type', 'title',
    'creator', 'language' and 'identifier'.  An empty dict is returned when
    fname has no extension or the extension is not supported.  When a
    supported file cannot be read or parsed the problem is logged and the
    dict collected so far (normally just 'type') is returned.
    """
    # pdf is deliberately not handled:
    # none of the pdfs in my library had language,isbn
    # most didn't have author, or had the wrong author
    # (author set to publisher, or software used)
    # so probably not much point in looking at pdfs
    # (reading them via PdfFileReader(...).getDocumentInfo() was tried)
    res = {}
    extn = os.path.splitext(fname)[1].lower()
    if not extn:
        return res

    if extn in ('.mobi', '.azw3'):
        res['type'] = extn[1:]
        try:
            book = Mobi(fname)
            book.parse()
        except Exception as e:
            logger.debug('Unable to parse mobi in %s, %s' % (fname, str(e)))
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    if extn == '.epub':
        res['type'] = "epub"
        # prepare to read from the .epub file
        try:
            zipdata = zipfile.ZipFile(fname)
        except Exception as e:
            logger.debug('Unable to parse zipfile %s, %s' % (fname, str(e)))
            return res
        try:
            # find the contents metafile
            container = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            cfname = ""
            if len(container):
                for rootfile in container[0]:
                    if 'full-path' in rootfile.attrib:
                        cfname = rootfile.attrib['full-path']
                        break
            if not cfname:
                # container does not point at a contents metafile
                return res
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
        except Exception as e:
            # missing/garbled container.xml or a missing archive member
            logger.debug('Unable to read metadata from %s, %s' % (fname, str(e)))
            return res
        finally:
            zipdata.close()
    elif extn == '.opf':
        res['type'] = "opf"
        with open(fname) as opf:
            txt = opf.read()
        # sanitize any unmatched html tags or ElementTree won't parse
        # (applied to the standalone .opf text only; epub data goes to the
        # parser untouched)
        dic = {'<br>': '', '</br>': ''}
        txt = replace_all(txt, dic)
    else:
        # unsupported extension: nothing to parse, and nothing worth logging
        return res

    # repackage epub or opf metadata
    try:
        tree = ElementTree.fromstring(txt)
    except Exception as e:
        logger.error("Error parsing metadata from %s, %s" % (fname, str(e)))
        return res

    if not len(tree):
        return res

    # the first child of the root element is expected to hold the metadata
    for elem in tree[0]:
        tag = str(elem.tag).lower()
        if '}' in tag:
            # strip the "{namespace}" prefix ElementTree puts on the tag
            tag = tag.split('}')[1]
        text = elem.text
        attrib = str(elem.attrib).lower()
        if 'title' in tag:
            res['title'] = text
        elif 'language' in tag:
            res['language'] = text
        elif 'creator' in tag:
            res['creator'] = text
        elif 'identifier' in tag and 'isbn' in attrib:
            if is_valid_isbn(text):
                res['identifier'] = text
    return res