Ejemplo n.º 1
0
def tesseract(zone):
    """Run the tesseract OCR engine on Image zone and return the ASCII text.

    zone is saved to a uniquely named scratch .tif under /tmp, tesseract is
    run on it with the same scratch name as its output base, and the text
    is read back from the matching .txt file.  Both scratch files are
    removed afterwards, whether or not the OCR succeeded.

    Returns the recognized text with every character whose ordinal is 128
    or greater dropped.

    Raises OCRException if tesseract exits with a nonzero status or writes
    more than 100 bytes to stderr; the stderr output is logged first.
    """
    # Unique scratch name, so we can run this function simultaneously from
    # multiple processes without fear of collisions.
    badge = uuid.uuid4().hex
    ft = "/tmp/region-" + badge
    try:
        zone.save(ft + ".tif")
        proc = subprocess.Popen(
            [
                "/usr/local/bin/tesseract",  # XXX location should be in cfg
                ft + ".tif",
                ft,
            ],
            stdin=_devnull,
            stdout=_devnull,
            stderr=subprocess.PIPE
        )
        # communicate() drains stderr, waits for the child and closes the
        # pipe, so the Popen object itself records the exit status instead
        # of the child being reaped behind its back with os.waitpid().
        _, err = proc.communicate()
        if proc.returncode != 0 or len(err) > 100:
            log.error(err)
            raise OCRException("OCR failed")
        text = util.readfrom(ft + ".txt")
    finally:
        # Always clean up the scratch files, even when OCR failed.
        for ext in (".tif", ".txt"):
            util.rmf(ft + ext)
    return "".join(c for c in text if ord(c) < 128)
Ejemplo n.º 2
0
 def __init__(self, location):
     """Create the cache and try to preload it from the files in location.

     location is created if needed.  Every directory entry whose extension
     is not ".jpg" is read and parsed with BallotTemplate.Template_from_XML;
     entries that parse are stored in self.cache under their file name.
     """
     self.cache = {}
     self.location = location
     util.mkdirp(location)
     self.log = logging.getLogger('')
     # Fallback text handed to util.readfrom: it will not parse, and it lets
     # us tell "nothing could be read" apart from a genuinely bad template.
     unparseable = "<"
     # attempt to prepopulate cache
     try:
         for entry in os.listdir(location):
             # Mitch 1/11/2011 really want != .xml
             _, extension = os.path.splitext(entry)
             if extension == ".jpg":
                 continue
             contents = util.readfrom(os.path.join(location, entry),
                                      unparseable)
             try:
                 parsed = BallotTemplate.Template_from_XML(contents)
             except ExpatError:
                 # Only report files that actually had content to parse.
                 if contents != unparseable:
                     self.log.exception("Could not parse " + entry)
             else:
                 self.cache[os.path.basename(entry)] = parsed
     except OSError:
         self.log.info("No templates found")
Ejemplo n.º 3
0
def main():
    """Run the ballot extraction loop under control of a parent process.

    Setup: reads the configuration file named on the command line, creates
    the top-level working directories, loads the ballot class for
    const.layout_brand, builds the template cache and opens the database
    (or a NullDB when const.use_db is false).

    Loop: the number of the next image is read from nexttoprocess.txt.
    Whenever no ballots remain to be processed, a "Next=NNNNNN , READY:"
    prompt (or "... + to SKIP:" after a missing-file problem) is printed
    and get_count_to_process() supplies how many ballots to handle; a count
    of 0 ends the loop.  For each ballot, all const.num_pages images must
    exist; the ballot is built and processed, inserted into the database,
    its images are renamed into the proc tree and nexttoprocess.txt is
    advanced by const.num_pages.

    Failures: a missing image is reported on stdout and the same ballot is
    tried again on the next pass; a BallotException marks the images as
    errors and skips to the next ballot; database and rename failures go
    to util.fatal.
    """
    # get command line arguments
    cfg_file = get_args()

    # read configuration from tevs.cfg and set constants for this run
    config.get(cfg_file)
    util.mkdirp(const.root)
    log = config.logger(util.root("extraction.log"))

    # create initial toplevel directories if they don't exist
    for p in ("templates", "template_images", "composite_images",
              "results", "proc", "errors"):
        util.mkdirp(util.root(p))

    # make sure you have code for ballot type spec'd in config file
    try:
        ballotfrom = Ballot.LoadBallotType(const.layout_brand)
    except KeyError:
        # Format the message here: handing util.fatal a single tuple for a
        # two-placeholder format does not match how it is called elsewhere
        # in this function (format string followed by separate arguments).
        util.fatal("No such ballot type: %s check %s !" %
                   (const.layout_brand, cfg_file))

    cache = Ballot.TemplateCache(util.root("templates"))
    extensions = Ballot.Extensions(template_cache=cache)

    # connect to db and open cursor
    if const.use_db:
        try:
            dbc = db.PostgresDB(database=const.dbname, user=const.dbuser)
        except db.DatabaseError:
            util.fatal("Could not connect to database!")
    else:
        dbc = db.NullDB()
    log.info("Database connected.")

    total_images_processed, total_images_left_unprocessed = 0, 0
    base = os.path.basename
    # Each time given a signal to proceed for count_to_process ballots,
    # create ballot from images, get landmarks, get layout code, get votes.
    # Write votes to database and results directory.

    NextToProcessFile = util.root("nexttoprocess.txt")
    count_to_process = 0
    # Set after a missing image so the next prompt offers "+ to SKIP".
    file_problem = False
    while True:
        log.debug("Top of loop.")
        next_ballot_number = int(util.readfrom(NextToProcessFile))
        log.debug("Read %d from %s" % (next_ballot_number, NextToProcessFile))
        if count_to_process == 0:
            # send prompt to controlling process, "READY:" or "+ for SKIP:"
            if file_problem:
                file_problem = False
                # do not remove space after %06d
                print("Next=%06d , + to SKIP:" % (next_ballot_number, ))
            else:
                # do not remove space after %06d
                print("Next=%06d , READY:" % (next_ballot_number, ))
            sys.stdout.flush()
            # wait here until get_count_to_process returns
            # it will wait on input instruction from stdio
            try:
                count_to_process = get_count_to_process(
                    next_ballot_number, log)
            except DoIncrementException:
                # Skip the current ballot: advance the counter file and
                # go back to the prompt without processing anything.
                log.debug("Do increment exception")
                util.writeto(NextToProcessFile,
                             next_ballot_number + const.num_pages)
                log.debug(
                    "Wrote %d to next_ballot_number, count to process is %d" %
                    (next_ballot_number + const.num_pages, count_to_process))
                count_to_process = 0
                log.debug("Setting count to process to 0.")
                continue
            # we're done when we get instructed to process 0
            if count_to_process == 0:
                break
        count_to_process -= 1
        try:
            # get number of next image,
            # clean up, in case...
            gc.collect()
            log.debug("Request for %d" % (next_ballot_number, ))
            unprocs = [
                incomingn(next_ballot_number + m)
                for m in range(const.num_pages)
            ]
            log.info(unprocs)
            # we need all images for sheet to be available to process it
            for filename in unprocs:
                log.info("Checking for path.")
                if not os.path.exists(filename):
                    log.info("File not present.")
                    errmsg = "File %s not present or available!!!" % (
                        base(filename), )
                    log.info(errmsg.replace("!!!", ""))
                    print(errmsg)
                    sys.stdout.flush()
                    raise FileNotPresentException(filename)
                log.info("Path found.")

            # Processing
            log.debug("Creating ballot.")
            try:
                ballot = ballotfrom(unprocs, extensions)
                log.debug("Created ballot, processing.")
                # The return value is not used here: the file writers that
                # consumed it are disabled, only the db insert remains.
                ballot.ProcessPages()
                log.debug("Processed.")
            except BallotException as e:
                total_images_left_unprocessed += mark_error(e, *unprocs)
                log.exception("Could not process ballot")
                # Move past the bad ballot so it is not retried.
                util.writeto(NextToProcessFile,
                             next_ballot_number + const.num_pages)
                continue

            # Write all data
            # make dirs:
            proc1d = dirn("proc", next_ballot_number)
            resultsd = dirn("results", next_ballot_number)

            resultsfilename = filen(resultsd, next_ballot_number)
            for p in (proc1d, resultsd):
                util.mkdirp(p)

            # write to the database
            try:
                log.debug("Inserting to db")
                dbc.insert(ballot)
            except db.DatabaseError:
                # dbc does not commit if there is an error, just need to
                # remove partial files
                remove_partial(resultsfilename + ".txt")
                remove_partial(resultsfilename + const.filename_extension)
                log.info("Could not commit to db")
                print("Could not commit to db!")
                util.fatal("Could not commit vote information to database")

            # Post-processing

            # move the images from unproc to proc
            log.debug("Renaming")
            procs = [
                filen(proc1d, next_ballot_number + m) +
                const.filename_extension for m in range(const.num_pages)
            ]
            for a, b in zip(unprocs, procs):
                try:
                    os.rename(a, b)
                except OSError:
                    log.info("Could not rename %s" % a)
                    util.fatal("Could not rename %s", a)
            total_images_processed += const.num_pages
            # Tell caller you've processed all images of this ballot
            log.debug("Requesting next")
            # update next ballot file with next image number
            util.writeto(NextToProcessFile,
                         next_ballot_number + const.num_pages)
            log.debug("Done writing nexttoprocess.txt")

            log.info("%d images processed", const.num_pages)
        except FileNotPresentException:
            # A missing image is not fatal: report it and let the next
            # prompt offer the controlling process a chance to skip.
            file_problem = True
            print("FileNotPresentException")
            sys.stdout.flush()
            log.info("FileNotPresentException occurred")
            continue
Ejemplo n.º 4
0
def main():
    """Run the ballot extraction loop, driven by processing commands.

    Setup: reads the configuration file named on the command line, creates
    the top-level working directories, loads the ballot class for
    const.layout_brand, builds the template cache and opens the database
    (or a NullDB when const.use_db is false).

    Loop: the number of the next image is read from nexttoprocess.txt.
    Whenever no ballots remain to be processed, get_processing_command()
    is asked what to do; the command's first character selects the action:

        "+"   skip ahead by const.num_pages, then process one ballot
        "=N"  jump to image number N, then process one ballot
        "S"   process one ballot at the current number
        "0"   stop

    Any other command leaves the count at 0 and therefore also stops.

    For each ballot, all const.num_pages images must exist; the ballot is
    built and processed, inserted into the database, its images are
    renamed into the proc tree and nexttoprocess.txt is advanced by
    const.num_pages.  A missing image is printed to stdout and is not
    fatal; a BallotException marks the images as errors; database and
    rename failures go to util.fatal.
    """
    # get command line arguments
    cfg_file = get_args()

    # read configuration from tevs.cfg and set constants for this run
    config.get(cfg_file)
    util.mkdirp(const.root)
    log = config.logger(const.logfilename)
    log.info("Log created.")
    # create initial toplevel directories if they don't exist
    for p in ("templates", "template_images", "composite_images",
              "results", "proc", "errors"):
        util.mkdirp(util.root(p))

    # make sure you have code for ballot type spec'd in config file
    try:
        ballotfrom = Ballot.LoadBallotType(const.layout_brand)
    except KeyError:
        util.fatal("No such ballot type: " + const.layout_brand + ": check " +
                   cfg_file)

    cache = Ballot.TemplateCache(util.root("templates"))
    extensions = Ballot.Extensions(template_cache=cache)

    # connect to db and open cursor
    if const.use_db:
        try:
            dbc = db.PostgresDB(database=const.dbname, user=const.dbuser)
        except db.DatabaseError:
            util.fatal("Could not connect to database")
    else:
        dbc = db.NullDB()
    log.info("Database connected.")

    total_images_processed, total_images_left_unprocessed = 0, 0
    base = os.path.basename
    # Each time given a signal to proceed for count_to_process ballots,
    # create ballot from images, get landmarks, get layout code, get votes.
    # Write votes to database and results directory.

    # File holding the number of the next image to process.
    # NOTE(review): computed once here on the assumption that util.root()
    # is a pure path join -- confirm.
    next_file = util.root("nexttoprocess.txt")
    count_to_process = 0
    while True:
        next_ballot_number = int(util.readfrom(next_file))
        if count_to_process == 0:
            # wait here until get_processing_command returns
            # it will wait on input instruction from stdio
            processing_command = get_processing_command(next_ballot_number)
            if processing_command.startswith("+"):
                next_ballot_number += const.num_pages
                util.writeto(next_file, next_ballot_number)
                count_to_process = 1
            elif processing_command.startswith("="):
                next_ballot_number = int(processing_command[1:])
                util.writeto(next_file, next_ballot_number)
                count_to_process = 1
            elif processing_command.startswith("S"):
                count_to_process = 1
            elif processing_command.startswith("0"):
                count_to_process = 0
            # we're done when we get instructed to process 0
            if count_to_process == 0:
                break
        count_to_process -= 1
        try:
            # get number of next image,
            # clean up, in case...
            gc.collect()
            log.debug("Request for %d" % (next_ballot_number, ))
            unprocs = [
                incomingn(next_ballot_number + m)
                for m in range(const.num_pages)
            ]
            log.info(unprocs)
            # we need all images for sheet to be available to process it
            for filename in unprocs:
                if not os.path.exists(filename):
                    errmsg = "File %s not present or available!" % (
                        base(filename), )
                    log.info(errmsg)
                    # if a file is not yet available, that's not fatal
                    raise FileNotPresentException(errmsg)

            # Processing
            log.debug("Creating ballot.")
            try:
                ballot = ballotfrom(unprocs, extensions)
                log.debug("Created ballot, processing.")
                # The return value is not used here: the file writers that
                # consumed it are disabled, only the db insert remains.
                ballot.ProcessPages()
                log.debug("Processed.")
            except BallotException as e:
                total_images_left_unprocessed += mark_error(e, *unprocs)
                log.exception("Could not process ballot")
                continue

            # Write all data
            # make dirs:
            proc1d = dirn("proc", next_ballot_number)
            resultsd = dirn("results", next_ballot_number)

            resultsfilename = filen(resultsd, next_ballot_number)
            for p in (proc1d, resultsd):
                util.mkdirp(p)

            # write to the database
            try:
                log.debug("Inserting to db")
                dbc.insert(ballot)
            except db.DatabaseError:
                # dbc does not commit if there is an error, just need to
                # remove partial files
                remove_partial(resultsfilename + ".txt")
                remove_partial(resultsfilename + const.filename_extension)
                log.info("Could not commit to db")
                print("Could not commit to db!")
                util.fatal("Could not commit vote information to database")

            # Post-processing

            # move the images from unproc to proc
            log.debug("Renaming")
            procs = [
                filen(proc1d, next_ballot_number + m) +
                const.filename_extension for m in range(const.num_pages)
            ]
            for a, b in zip(unprocs, procs):
                try:
                    os.rename(a, b)
                except OSError:
                    log.info("Could not rename %s" % a)
                    util.fatal("Could not rename %s", a)
            total_images_processed += const.num_pages
            # Tell caller you've processed all images of this ballot
            log.debug("Requesting next")
            # update next ballot file with next image number
            util.writeto(next_file, next_ballot_number + const.num_pages)
            log.debug("Done writing nexttoprocess.txt")

            log.info("%d images processed", const.num_pages)
        except FileNotPresentException as e:
            # Report the missing image to the controlling process; the loop
            # then carries on (or prompts again once the count reaches 0).
            print(e)
            sys.stdout.flush()
Ejemplo n.º 5
0
Archivo: next.py Proyecto: wrishel/tevs
 def __init__(self, next_file, inc):
     """Store inc and next_file and load the starting value of self.next.

     The starting value is whatever util.readfrom returns for next_file,
     converted to int.
     """
     self.inc = inc
     self.next_file = next_file
     # NOTE(review): the second argument (1) is presumably the fallback
     # util.readfrom returns when next_file cannot be read -- confirm.
     self.next = int(util.readfrom(next_file, 1))