Example #1
0
def test_parse():
    """Check that ``tools.parse`` exposes each field of a claim string.

    The claim ``'#1 @ 2,3: 5x4'`` is expected to yield an object whose
    ``id``, ``left``, ``top``, ``width`` and ``height`` attributes are
    1, 2, 3, 5 and 4 respectively.
    """
    parsed = tools.parse('#1 @ 2,3: 5x4')
    # Checked in a fixed order so the first mismatching field is reported.
    expected_fields = (
        ('id', 1),
        ('left', 2),
        ('top', 3),
        ('width', 5),
        ('height', 4),
    )
    for attr_name, expected in expected_fields:
        assert getattr(parsed, attr_name) == expected
Example #2
0
def test_sample():
    """Overlay three sample claims on one grid and print the result.

    Every (x, y) pair yielded by iterating a parsed claim increments a
    counter for that cell; the resulting mapping is handed to
    ``tools.print_mapa``.
    """
    sample_claims = (
        '#1 @ 1,3: 4x4',
        '#2 @ 3,1: 4x4',
        '#3 @ 5,5: 2x2',
    )
    coverage = defaultdict(int)
    for text in sample_claims:
        for x, y in tools.parse(text):
            cell = (x, y)
            coverage[cell] += 1
    tools.print_mapa(coverage)
Example #3
0
def BookParse(bookid, pages=None, exclude=None):
    """Parse the PDF of one book page by page and store the results in the DB.

    The book is read from ``data/book<bookid>.pdf`` (e.g. ``book12.pdf`` for
    ``bookid`` 12). For every selected page the function:

    1. inserts a page entry without HTML,
    2. recognizes the page type (unrecognized pages are skipped),
    3. parses the page and bulk-inserts the parsed items,
    4. converts the page to HTML and updates the page entry with it.

    Failures in parsing, item insertion or HTML conversion are logged via
    ``exception_msg`` and do not abort the remaining pages.

    Args:
        bookid: Id of the book in the DB; also selects the PDF file name.
        pages: Optional collection of 1-based page numbers to process.
            ``None`` or an empty collection means "all pages".
        exclude: Optional collection of 1-based page numbers to skip.

    Note:
        Only membership tests are performed on ``pages`` and ``exclude``
        here. A range spec such as ``1,2,3-8,15`` is presumably expanded
        into page numbers by the caller -- TODO confirm.

    Raises:
        FileNotFoundError: If the book file does not exist (logged, then
            re-raised).
        PDFTextExtractionNotAllowed: If the document forbids extraction.
    """
    if pages is None:
        pages = set()
    if exclude is None:
        exclude = set()

    try:
        bookfile = open("data/book" + str(bookid) + ".pdf", "rb")
    except FileNotFoundError as e:
        exception_msg(lg, e
                        , level="ERR"
                        , text="No such book (id=%s) in data dir."\
                                 % str(bookid))
        raise

    # Keep the file open only while pages are being read from it and make
    # sure it is closed on every exit path (normal return or exception).
    with bookfile:
        mineparser = PDFParser(bookfile)
        document = PDFDocument(mineparser)
        if not document.is_extractable:
            lg.error("PDF text extraction is not allowed.")
            raise PDFTextExtractionNotAllowed

        db = DBManager()

        for pagenum, page in enumerate(PDFPage.create_pages(document)):
            # enumerate() is 0-based; page numbers used everywhere else
            # (logs, DB, pages/exclude filters) are 1-based.
            realnum = pagenum + 1
            lg.info("Working on page %s (bookid=%s)", str(realnum),
                    str(bookid))
            if (len(pages) > 0 and realnum not in pages)\
               or realnum in exclude:
                lg.info("Page %s (bookid=%s) excluded.", str(realnum),
                        str(bookid))
                continue

            # Insert page entry to db, no HTML
            db.insert_page(bookid, realnum)

            lg.info("Recognizing (pagenum=%s) of book (id=%s).",
                    str(realnum), str(bookid))
            pagetype = recognize(bookid, page)

            # -1 is treated as "page type could not be recognized".
            if pagetype == -1:
                lg.warning(
                    "Can't recognize page (pagenum=%s) in book (id=%s).",
                    str(realnum), str(bookid))
                lg.info("Page %s (bookid=%s) skipped.", str(realnum),
                        str(bookid))
                continue

            lg.info(
                "Parsing (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                str(realnum), str(bookid), str(pagetype))
            try:
                data = parse(bookid, page, pagetype)
            except Exception as e:
                # A page that cannot be parsed is skipped entirely,
                # including its HTML conversion.
                exception_msg(lg, e
                                , level="WARN"
                                , text="Errors while parsing."
                                       " Skip (pagenum=%s) of book (id=%s)"\
                                        % (str(realnum), str(bookid)))
                continue
            else:
                lg.info(
                    "Inserting items to DB."
                    " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                    str(realnum), str(bookid), str(pagetype))
                try:
                    db.bulk_insert(bookid, data, pnum=realnum)
                except Exception as e:
                    # Best effort: a failed insert is logged, and the HTML
                    # step below still runs for this page.
                    exception_msg(lg,
                                  e,
                                  level="ERR",
                                  text="Errors during inserting data into DB."
                                  " Maybe you should check the parser")

            # Update page entry with parsed HTML
            lg.info("Parsing to HTML (pagenum=%s) of book (id=%s).",
                    str(realnum), str(bookid))
            try:
                html = pdftohtml(page)
            except Exception as e:
                exception_msg(lg, e
                                , text="Cannot convert PDF to HTML."
                                       " (pagenum=%s) of book (id=%s)"\
                                       % (str(realnum), str(bookid)))
            else:
                lg.info(
                    "Inserting HTML to DB."
                    " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                    str(realnum), str(bookid), str(pagetype))
                db.insert_page(bookid, realnum, data=html)

            lg.info(
                "Done with page."
                " (pagenum=%s) of book (id=%s). Type (pagetype=%s).",
                str(realnum), str(bookid), str(pagetype))
Example #4
0
import os

user = getpass.getuser()

import socket
computer_name = socket.gethostname()

print '''
#-----------------------------------------------------------------------------#
# Shell4Win                                                                   #
#                                                                             #
# This is an open source shell interpreter                                    #
# it is made for system administrators who are used to shell syntax and       #
# need to deal with Windows from now and then. it enables you to use shell    #
# commands and scripts under Windows environment.                             #
#                                                                             #
# author:log4leo  https://github.com/log4leo/Shell4Win                        #
# license: BSD                                                                #
#                                                                             #
#-----------------------------------------------------------------------------#
'''
print "[Current directory]" + os.getcwd()
while 1:
    try:
        s = raw_input('[' + user + '@' + computer_name + ']#')
    except EOFError:
        break
    if s.startswith('#'): continue
    if not s: continue
    tools.parse(s)
def sh(fn):
    """Run a script file by passing each of its lines to ``tools.parse``.

    Args:
        fn: Path of the script file to read (opened in text read mode).

    Each line is stripped of leading/trailing whitespace before being
    parsed. The file is closed when the function exits, including when
    ``tools.parse`` raises.
    """
    # `with` guarantees the handle is released; it was previously leaked.
    with open(fn, "r") as script:
        for line in script:
            tools.parse(line.strip())
Example #6
0
def test_print_mapa():
    """Print the grid produced by a single parsed claim.

    Each (x, y) pair yielded by the claim ``'#1 @ 2,3: 5x4'`` is counted
    once in a ``defaultdict(int)``, which is then passed to
    ``tools.print_mapa``.
    """
    coverage = defaultdict(int)
    parsed = tools.parse('#1 @ 2,3: 5x4')
    for x, y in parsed:
        cell = (x, y)
        coverage[cell] += 1
    tools.print_mapa(coverage)
Example #7
0
def test_iterator():
    """Iterate over a parsed claim and print every item it yields."""
    parsed = tools.parse('#1 @ 2,3: 5x4')
    cells = iter(parsed)
    for cell in cells:
        print(cell)
Example #8
0
def sh(fn):
    """Run a script file by passing each of its lines to ``tools.parse``.

    Args:
        fn: Path of the script file to read (opened in text read mode).

    Each line is stripped of leading/trailing whitespace before being
    parsed. The file is closed when the function exits, including when
    ``tools.parse`` raises.
    """
    # `with` guarantees the handle is released; it was previously leaked.
    with open(fn, "r") as script:
        for line in script:
            tools.parse(line.strip())