Ejemplo n.º 1
0
def BookParse(bookid, pages=None, exclude=None):
    """Parse the PDF of one book and store the results in the database.

    The book is read from ``data/book<bookid>.pdf`` (e.g. ``data/book12.pdf``
    for the book whose id in the DB is 12). Every selected page is
    registered in the DB, recognized, parsed, its items bulk-inserted, and
    finally converted to HTML which is stored on the page entry.

    Args:
        bookid: Id of the book in the DB; also used to build the file name.
        pages: Optional collection of 1-based page numbers to process.
            ``None`` or an empty collection means "all pages".
        exclude: Optional collection of 1-based page numbers to skip.

    Page selections are tested with ``in`` against integer page numbers, so
    they must already be collections of ints.
    NOTE(review): the textual range format ``1,2,3-8,15`` is presumably
    expanded by the caller before it gets here -- confirm.

    Raises:
        FileNotFoundError: If the book file does not exist (logged, then
            re-raised).
        PDFTextExtractionNotAllowed: If the document does not allow text
            extraction.
    """
    if pages is None:
        pages = set()
    if exclude is None:
        exclude = set()

    try:
        bookfile = open("data/book" + str(bookid) + ".pdf", "rb")
    except FileNotFoundError as e:
        exception_msg(lg, e,
                      level="ERR",
                      text="No such book (id=%s) in data dir." % (bookid,))
        raise

    # Close the file on every exit path (normal end, extraction not allowed,
    # unexpected errors from the page iterator or the DB layer).
    with bookfile:
        mineparser = PDFParser(bookfile)
        document = PDFDocument(mineparser)
        if not document.is_extractable:
            lg.error("PDF text extraction is not allowed.")
            raise PDFTextExtractionNotAllowed

        db = DBManager()

        # Loop-invariant: an empty "pages" selection means no restriction.
        restrict_to_pages = len(pages) > 0

        for pagenum, page in enumerate(PDFPage.create_pages(document)):
            realnum = pagenum + 1  # page numbers are 1-based for users/DB
            lg.info("Working on page %s (bookid=%s)", realnum, bookid)
            if (restrict_to_pages and realnum not in pages) \
                    or realnum in exclude:
                lg.info("Page %s (bookid=%s) excluded.", realnum, bookid)
                continue

            # Insert page entry to db, no HTML
            db.insert_page(bookid, realnum)

            lg.info("Recognizing (pagenum=%s) of book (id=%s).",
                    realnum, bookid)
            pagetype = recognize(bookid, page)

            # -1 is the "unrecognized" marker returned by recognize().
            if pagetype == -1:
                lg.warning(
                    "Can't recognize page (pagenum=%s) in book (id=%s).",
                    realnum, bookid)
                lg.info("Page %s (bookid=%s) skipped.", realnum, bookid)
                continue

            lg.info(
                "Parsing (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                realnum, bookid, pagetype)
            try:
                data = parse(bookid, page, pagetype)
            except Exception as e:
                # Best effort: a page that fails to parse is logged and
                # skipped so the rest of the book is still processed.
                exception_msg(lg, e,
                              level="WARN",
                              text="Errors while parsing."
                                   " Skip (pagenum=%s) of book (id=%s)"
                                   % (realnum, bookid))
                continue
            else:
                lg.info(
                    "Inserting items to DB."
                    " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                    realnum, bookid, pagetype)
                try:
                    db.bulk_insert(bookid, data, pnum=realnum)
                except Exception as e:
                    # Logged only; the HTML conversion below still runs for
                    # this page even if its items could not be inserted.
                    exception_msg(lg,
                                  e,
                                  level="ERR",
                                  text="Errors during inserting data into DB."
                                  " Maybe you should check the parser")

            # Update page entry with parsed HTML
            lg.info("Parsing to HTML (pagenum=%s) of book (id=%s).",
                    realnum, bookid)
            try:
                html = pdftohtml(page)
            except Exception as e:
                exception_msg(lg, e,
                              text="Cannot convert PDF to HTML."
                                   " (pagenum=%s) of book (id=%s)"
                                   % (realnum, bookid))
            else:
                lg.info(
                    "Inserting HTML to DB."
                    " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                    realnum, bookid, pagetype)
                db.insert_page(bookid, realnum, data=html)

            lg.info(
                "Done with page."
                " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                realnum, bookid, pagetype)