Beispiel #1
0
def get_book_info(fname):
    """Return basic metadata for an ebook file as a dict.

    Only ``mobi``, ``epub`` and ``opf`` files are handled; the type is taken
    from the text after the last '.' in *fname* (case-sensitive).

    Returns:
        ``{}`` if *fname* contains no '.', if the mobi file cannot be parsed,
        or if the XML root has no child elements.
        ``""`` if the extension is not one of the supported types (kept
        as-is for backward compatibility with existing callers).
        Otherwise a dict that may contain the keys 'title', 'language',
        'creator' and 'identifier'.

    Errors raised while opening or parsing an epub/opf file are not caught
    here and propagate to the caller.
    """
    res = {}
    if '.' not in fname:
        return res
    extn = fname.rsplit('.', 1)[1]

    if extn == "mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn == "epub":
        # the archive is only needed while locating and reading the metafile
        with zipfile.ZipFile(fname) as zipdata:
            container = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(container):
                return res
            cfname = ""
            for rootfile in container[0]:
                # read the attribute directly; parsing str(attrib) depended
                # on attribute order and broke on paths containing ',' or ':'
                if 'full-path' in rootfile.attrib:
                    cfname = rootfile.attrib['full-path']
            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
        tree = ElementTree.fromstring(txt)
    elif extn == "opf":
        # binary mode lets the XML parser honour the file's own encoding
        with open(fname, 'rb') as opf_file:
            tree = ElementTree.fromstring(opf_file.read())
    else:
        return ""

    # repackage the data; the first child of the root is treated as the
    # metadata block
    if not len(tree):
        return res
    for item in tree[0]:
        tag = str(item.tag)
        if '}' not in tag:
            # un-namespaced element: nothing we are looking for
            continue
        tag = tag.split('}')[1].lower()
        txt = item.text
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in str(item.attrib).lower():
            if formatter.is_valid_isbn(txt):
                res['identifier'] = txt
    return res
Beispiel #2
0
def get_book_info(fname):
    """Return basic metadata for an ebook file as a dict.

    Only ``.mobi``, ``.epub`` and ``.opf`` files are handled, selected by
    ``os.path.splitext`` (case-sensitive).

    Returns:
        A dict. It is empty when *fname* has no extension or an unsupported
        one. For a supported file it contains 'type' and, where found,
        'title', 'language', 'creator' and 'identifier'. If the file cannot
        be opened as mobi/zip, or the metadata XML cannot be parsed, the
        partially filled dict is returned (a metadata parse failure is
        reported through ``logger.error``).
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi":
        res['type'] = "mobi"
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi yields only the type
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn == ".epub":
        res['type'] = "epub"

        # prepare to read from the .epub file
        try:
            zipdata = zipfile.ZipFile(fname)
        except Exception:
            return res

        with zipdata:  # make sure the archive handle is released
            # find the contents metafile
            container = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(container):
                return res

            cfname = ""
            for rootfile in container[0]:
                if 'full-path' in rootfile.attrib:
                    cfname = rootfile.attrib['full-path']
                    break

            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)

    elif extn == ".opf":
        res['type'] = "opf"
        with open(fname) as opf_file:
            txt = opf_file.read()
        # sanitize any unmatched html tags or ElementTree won't parse
        dic = {'<br>': '', '</br>': ''}
        txt = replace_all(txt, dic)

    else:
        # unsupported extension: there is nothing to parse, so return now
        # instead of falling through with no metadata text to work on
        return res

    # repackage epub or opf metadata
    try:
        tree = ElementTree.fromstring(txt)
    except Exception as e:
        logger.error("Error parsing metadata from %s" % fname)
        logger.error(str(e))
        return res

    if not len(tree):
        return res
    # the first child of the root is treated as the metadata block;
    # iterating directly guarantees progress on every element, including
    # un-namespaced ones that are skipped
    for item in tree[0]:
        tag = str(item.tag).lower()
        if '}' not in tag:
            continue
        tag = tag.split('}')[1]
        txt = item.text
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in str(item.attrib).lower():
            if is_valid_isbn(txt):
                res['identifier'] = txt
    return res
Beispiel #3
0
def _normalise_isbn(text):
    """Return *text* as a bare ISBN-10/ISBN-13 string, or "" if it is not one.

    Hyphens and spaces are stripped first. An ISBN-13 must then be 13
    digits; an ISBN-10 must be 9 digits followed by a digit or 'X'.
    Only the shape is checked, not the check digit.
    """
    if not text:
        return ""
    candidate = text.replace("-", "").replace(" ", "")
    if len(candidate) == 13 and candidate.isdigit():
        return candidate
    if len(candidate) == 10 and candidate[:9].isdigit():
        if candidate[9].isdigit() or candidate[9] in "Xx":
            return candidate
    return ""


def get_book_info(fname):
    """Return basic metadata for an ebook file.

    Only ``mobi``, ``epub`` and ``opf`` files are handled; the type is taken
    from the text after the last '.' in *fname* (case-sensitive).

    Returns:
        ``""`` if the file type is not supported (kept as-is for backward
        compatibility with existing callers).
        Otherwise a dict that may contain 'title', 'language', 'creator'
        and 'identifier'. For epub/opf, 'identifier' holds the ISBN-looking
        identifier if there is one, or "" when identifiers are present but
        none looks like an ISBN.

    Errors raised while opening or parsing the file are not caught here.
    """
    extn = fname.split(".")[-1]

    if extn == "mobi":
        book = Mobi(fname)
        book.parse()
        return {
            "creator": book.author(),
            "title": book.title(),
            "language": book.language(),
            "identifier": book.isbn(),
        }

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn == "epub":
        with zipfile.ZipFile(fname) as archive:
            # find the contents metafile
            container = ElementTree.fromstring(
                archive.read("META-INF/container.xml"))
            if not len(container):
                return {}
            cfname = ""
            for rootfile in container[0]:
                # read the attribute directly; parsing the dict's string
                # form depended on attribute order
                if "full-path" in rootfile.attrib:
                    cfname = rootfile.attrib["full-path"]
            # grab the metadata block from the contents metafile
            tree = ElementTree.fromstring(archive.read(cfname))
    elif extn == "opf":
        # binary mode lets the XML parser honour the file's own encoding
        with open(fname, "rb") as handle:
            tree = ElementTree.fromstring(handle.read())
    else:
        return ""

    # repackage the data; the first child of the root is treated as the
    # metadata block. There can be several "identifier" elements, only one
    # of which is an isbn, so each is shape-checked and a non-isbn
    # identifier never overwrites an isbn that was already found.
    res = {}
    if not len(tree):
        return res
    for item in tree[0]:
        if "}" not in item.tag:
            # un-namespaced element: nothing we are looking for
            continue
        tag = item.tag.split("}")[1].lower()
        txt = item.text
        if "title" in tag:
            res["title"] = txt
        elif "language" in tag:
            res["language"] = txt
        elif "creator" in tag:
            res["creator"] = txt
        elif "identifier" in tag:
            isbn = _normalise_isbn(txt)
            if isbn:
                res["identifier"] = isbn
            else:
                res.setdefault("identifier", "")
    return res
Beispiel #4
0
def get_book_info(fname):
    """Return basic metadata for an ebook file as a dict.

    Only ``.mobi``, ``.epub`` and ``.opf`` files are handled, selected by
    ``os.path.splitext`` (case-sensitive).

    Returns:
        ``{}`` if *fname* has no extension, if the mobi file cannot be
        parsed, or if the epub container has no child elements.
        ``""`` if the extension is not one of the supported types (kept
        as-is for backward compatibility with existing callers).
        Otherwise a dict containing 'type' and, where found, 'title',
        'language', 'creator' and 'identifier'.

    Errors raised while opening or parsing an epub/opf file are not caught
    here and propagate to the caller.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res["creator"] = book.author()
        res["title"] = book.title()
        res["language"] = book.language()
        res["identifier"] = book.isbn()
        res["type"] = "mobi"
        return res

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn == ".epub":
        # the archive is only needed while locating and reading the metafile
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            container = ElementTree.fromstring(
                zipdata.read("META-INF/container.xml"))
            if not len(container):
                return res

            cfname = ""
            for rootfile in container[0]:
                if "full-path" in rootfile.attrib:
                    cfname = rootfile.attrib["full-path"]

            # grab the metadata block from the contents metafile
            txt = zipdata.read(cfname)
        tree = ElementTree.fromstring(txt)
        res["type"] = "epub"
    elif extn == ".opf":
        # binary mode lets the XML parser honour the file's own encoding
        with open(fname, "rb") as opf_file:
            tree = ElementTree.fromstring(opf_file.read())
        res["type"] = "opf"
    else:
        return ""

    # repackage the data; the first child of the root is treated as the
    # metadata block
    if not len(tree):
        return res
    for item in tree[0]:
        tag = str(item.tag)
        if "}" not in tag:
            # un-namespaced element: nothing we are looking for, and
            # indexing the split result would fail on it
            continue
        tag = tag.split("}")[1].lower()
        txt = item.text
        if "title" in tag:
            res["title"] = txt
        elif "language" in tag:
            res["language"] = txt
        elif "creator" in tag:
            res["creator"] = txt
        elif "identifier" in tag and "isbn" in str(item.attrib).lower():
            if formatter.is_valid_isbn(txt):
                res["identifier"] = txt
    return res
Beispiel #5
0
def get_book_info(fname):
    """Return basic metadata for an ebook file as a dict.

    Only ``.mobi``, ``.azw3``, ``.epub`` and ``.opf`` files are handled,
    selected by ``os.path.splitext`` (case-sensitive).

    Returns:
        A dict. It is empty when *fname* has no extension or an unsupported
        one. For a supported file it contains 'type' and, where found,
        'title', 'language', 'creator' and 'identifier'. Failures to read
        the mobi file or the epub archive are reported through
        ``logger.debug``, and a metadata XML parse failure through
        ``logger.error``; in each case the partially filled dict is
        returned.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi" or extn == ".azw3":
        res['type'] = extn[1:]
        try:
            book = Mobi(fname)
            book.parse()
        except Exception as e:
            logger.debug('Unable to parse mobi in %s, %s' % (fname, str(e)))
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        return res

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn == ".epub":
        res['type'] = "epub"

        # A missing container.xml or OPF entry is treated like an
        # unreadable archive: log it and return what we have.
        try:
            with zipfile.ZipFile(fname) as zipdata:
                # find the contents metafile
                container = ElementTree.fromstring(
                    zipdata.read('META-INF/container.xml'))
                if not len(container):
                    return res

                cfname = ""
                for rootfile in container[0]:
                    if 'full-path' in rootfile.attrib:
                        cfname = rootfile.attrib['full-path']
                        break

                # grab the metadata block from the contents metafile
                txt = zipdata.read(cfname)
        except Exception as e:
            logger.debug('Unable to parse zipfile %s, %s' % (fname, str(e)))
            return res

    elif extn == ".opf":
        res['type'] = "opf"
        with open(fname) as opf_file:
            txt = opf_file.read()
        # sanitize any unmatched html tags or ElementTree won't parse
        dic = {'<br>': '', '</br>': ''}
        txt = replace_all(txt, dic)

    else:
        # unsupported extension: there is nothing to parse, so return now
        # instead of falling through with no metadata text to work on
        return res

    # repackage epub or opf metadata
    try:
        tree = ElementTree.fromstring(txt)
    except Exception as e:
        logger.error("Error parsing metadata from %s, %s" % (fname, str(e)))
        return res

    if not len(tree):
        return res
    # the first child of the root is treated as the metadata block
    for item in tree[0]:
        tag = str(item.tag).lower()
        if '}' not in tag:
            # un-namespaced element: nothing we are looking for
            continue
        tag = tag.split('}')[1]
        txt = item.text
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in str(item.attrib).lower():
            if is_valid_isbn(txt):
                res['identifier'] = txt
    return res
def get_book_info(fname):
    """Return basic metadata for an ebook file as a dict.

    Only ``.mobi``, ``.epub`` and ``.opf`` files are handled, selected by
    ``os.path.splitext`` (case-sensitive).

    Returns:
        ``{}`` if *fname* has no extension, if the mobi file cannot be
        parsed, or if the epub container has no child elements.
        ``""`` if the extension is not one of the supported types (kept
        as-is for backward compatibility with existing callers).
        Otherwise a dict containing 'type' and, where found, 'title',
        'language', 'creator' and 'identifier'.

    Errors raised while opening or parsing an epub/opf file are not caught
    here and propagate to the caller.
    """
    res = {}
    extn = os.path.splitext(fname)[1]
    if not extn:
        return res

    if extn == ".mobi":
        try:
            book = Mobi(fname)
            book.parse()
        except Exception:
            # best effort: an unreadable mobi simply yields no metadata
            return res
        res['creator'] = book.author()
        res['title'] = book.title()
        res['language'] = book.language()
        res['identifier'] = book.isbn()
        res['type'] = "mobi"
        return res

    # pdf is deliberately not handled: none of the pdfs in the author's
    # library had language or isbn, and most had a missing or wrong author
    # (set to the publisher or the software used).

    if extn not in (".epub", ".opf"):
        return ""

    if extn == ".epub":
        # close the archive as soon as the metafile has been read
        with zipfile.ZipFile(fname) as zipdata:
            # find the contents metafile
            container = ElementTree.fromstring(
                zipdata.read('META-INF/container.xml'))
            if not len(container):
                return res

            cfname = ""
            for rootfile in container[0]:
                if 'full-path' in rootfile.attrib:
                    cfname = rootfile.attrib['full-path']

            # grab the metadata block from the contents metafile
            raw = zipdata.read(cfname)
        book_type = "epub"
    else:
        # binary mode lets the XML parser honour the file's own encoding
        with open(fname, 'rb') as opf_file:
            raw = opf_file.read()
        book_type = "opf"

    tree = ElementTree.fromstring(raw)
    res['type'] = book_type

    # repackage the data; the first child of the root is treated as the
    # metadata block
    if not len(tree):
        return res
    for item in tree[0]:
        # split off the namespace; elements without one are skipped because
        # indexing the split result would fail on them
        pieces = str(item.tag).split('}')
        if len(pieces) < 2:
            continue
        tag = pieces[1].lower()
        txt = item.text
        if 'title' in tag:
            res['title'] = txt
        elif 'language' in tag:
            res['language'] = txt
        elif 'creator' in tag:
            res['creator'] = txt
        elif 'identifier' in tag and 'isbn' in str(item.attrib).lower():
            if formatter.is_valid_isbn(txt):
                res['identifier'] = txt
    return res