def tesseract(zone):
    """Run the tesseract OCR engine on Image zone and return its ASCII text.

    The zone is saved to a uniquely named TIFF under /tmp, tesseract is run
    on it, and the text file it produces is read back.  Both temporary files
    are removed whether or not OCR succeeds.

    Raises OCRException when tesseract exits with a non-zero status or
    writes more than 100 characters to stderr.
    """
    # A random name per call, so we can run this function simultaneously
    # from multiple processes without fear of collisions.
    badge = uuid.uuid4().hex
    ft = "/tmp/region-" + badge
    try:
        zone.save(ft + ".tif")
        proc = subprocess.Popen(
            [
                "/usr/local/bin/tesseract",  # XXX location should be in cfg
                ft + ".tif",
                ft,
            ],
            stdin=_devnull,
            stdout=_devnull,
            stderr=subprocess.PIPE
        )
        # communicate() drains stderr, closes the pipe and waits for the
        # child, so the Popen object itself records the exit status.
        _, err = proc.communicate()
        if proc.returncode != 0 or len(err) > 100:
            log.error(err)
            raise OCRException("OCR failed")
        text = util.readfrom(ft + ".txt")
    finally:
        for ext in (".tif", ".txt"):
            util.rmf(ft + ext)
    # keep only 7-bit ASCII characters
    return "".join(c for c in text if ord(c) < 128)
def __init__(self, location):
    """Create the cache and try to preload every template in location.

    location is created if needed.  Every directory entry whose extension
    is not ".jpg" is read and parsed with BallotTemplate.Template_from_XML;
    entries that parse are stored in self.cache under their file name.
    """
    self.cache = {}
    self.location = location
    util.mkdirp(location)
    self.log = logging.getLogger('')

    # attempt to prepopulate cache
    try:
        for entry in os.listdir(location):
            # Mitch 1/11/2011 really want != .xml
            _, extension = os.path.splitext(entry)
            if extension == ".jpg":
                continue
            # default to text that will not parse
            contents = util.readfrom(os.path.join(location, entry), "<")
            try:
                template = BallotTemplate.Template_from_XML(contents)
            except ExpatError:
                # the "<" default means the read fell back; only log
                # parse failures of real file contents
                if contents != "<":
                    self.log.exception("Could not parse " + entry)
            else:
                self.cache[os.path.basename(entry)] = template
    except OSError:
        self.log.info("No templates found")
def main():
    """Run the interactive ballot-extraction loop.

    Reads the configuration file named on the command line, creates the
    working directories, loads the ballot class for const.layout_brand and
    opens the database (db.NullDB when const.use_db is false).

    The loop then reads the next image number from nexttoprocess.txt,
    prints a "Next=NNNNNN , READY:" prompt (", + to SKIP:" after a missing
    file) on stdout, and asks get_count_to_process() how many ballots to
    handle.  A count of 0 ends the loop.  For each ballot the
    const.num_pages image files are processed, inserted into the database
    and renamed into the "proc" tree, after which nexttoprocess.txt is
    advanced by const.num_pages.
    """
    # get command line arguments
    cfg_file = get_args()

    # read configuration from tevs.cfg and set constants for this run
    config.get(cfg_file)
    util.mkdirp(const.root)
    log = config.logger(util.root("extraction.log"))

    # create initial toplevel directories if they don't exist
    for subdir in ("templates", "template_images", "composite_images",
                   "results", "proc", "errors"):
        util.mkdirp(util.root(subdir))

    # make sure you have code for ballot type spec'd in config file
    try:
        ballotfrom = Ballot.LoadBallotType(const.layout_brand)
    except KeyError:
        # Format the message before the call: the previous code handed
        # util.fatal a single tuple for a two-placeholder format string.
        # NOTE(review): util.fatal is assumed to terminate the process.
        util.fatal("No such ballot type: %s check %s !"
                   % (const.layout_brand, cfg_file))

    cache = Ballot.TemplateCache(util.root("templates"))
    extensions = Ballot.Extensions(template_cache=cache)

    # connect to db and open cursor
    if const.use_db:
        try:
            dbc = db.PostgresDB(database=const.dbname, user=const.dbuser)
        except db.DatabaseError:
            util.fatal("Could not connect to database!")
    else:
        dbc = db.NullDB()
    log.info("Database connected.")

    total_images_processed, total_images_left_unprocessed = 0, 0

    # Each time given a signal to proceed for count_to_process ballots,
    # create ballot from images, get landmarks, get layout code, get votes.
    # Write votes to database and results directory.
    next_to_process_file = util.root("nexttoprocess.txt")
    count_to_process = 0
    file_problem = False
    while True:
        log.debug("Top of loop.")
        next_ballot_number = int(util.readfrom(next_to_process_file))
        log.debug("Read %d from %s", next_ballot_number, next_to_process_file)

        if count_to_process == 0:
            # send prompt to controlling process, "READY:" or "+ for SKIP:"
            # (single-argument print(...) behaves the same under the
            # Python 2 print statement and the Python 3 function)
            if file_problem:
                file_problem = False
                # do not remove space after %06d
                print("Next=%06d , + to SKIP:" % (next_ballot_number, ))
            else:
                # do not remove space after %06d
                print("Next=%06d , READY:" % (next_ballot_number, ))
            sys.stdout.flush()

            # wait here until get_count_to_process returns
            # it will wait on input instruction from stdio
            try:
                count_to_process = get_count_to_process(
                    next_ballot_number, log)
            except DoIncrementException:
                # skip the current ballot: advance the counter and prompt
                # again on the next pass
                log.debug("Do increment exception")
                util.writeto(next_to_process_file,
                             next_ballot_number + const.num_pages)
                log.debug(
                    "Wrote %d to next_ballot_number, count to process is %d",
                    next_ballot_number + const.num_pages, count_to_process)
                count_to_process = 0
                log.debug("Setting count to process to 0.")
                continue

            # we're done when we get instructed to process 0
            if count_to_process == 0:
                break

        count_to_process -= 1
        try:
            # clean up, in case...
            gc.collect()
            log.debug("Request for %d", next_ballot_number)
            unprocs = [
                incomingn(next_ballot_number + m)
                for m in range(const.num_pages)
            ]
            log.info(unprocs)

            # we need all images for sheet to be available to process it
            for filename in unprocs:
                log.info("Checking for path.")
                if not os.path.exists(filename):
                    log.info("File not present.")
                    errmsg = "File %s not present or available!!!" % (
                        os.path.basename(filename), )
                    log.info(errmsg.replace("!!!", ""))
                    print(errmsg)
                    sys.stdout.flush()
                    raise FileNotPresentException(filename)
                log.info("Path found.")

            # Processing
            log.debug("Creating ballot.")
            try:
                ballot = ballotfrom(unprocs, extensions)
                log.debug("Created ballot, processing.")
                ballot.ProcessPages()
                log.debug("Processed.")
            except BallotException as e:
                total_images_left_unprocessed += mark_error(e, *unprocs)
                log.exception("Could not process ballot")
                util.writeto(next_to_process_file,
                             next_ballot_number + const.num_pages)
                continue

            # Write all data
            # make dirs:
            proc1d = dirn("proc", next_ballot_number)
            resultsd = dirn("results", next_ballot_number)
            resultsfilename = filen(resultsd, next_ballot_number)
            for directory in (proc1d, resultsd):
                util.mkdirp(directory)

            # NOTE: writing VOP/CSV result files is currently disabled;
            # the ballot is only stored through dbc.insert below.

            # write to the database
            try:
                log.debug("Inserting to db")
                dbc.insert(ballot)
            except db.DatabaseError:
                # dbc does not commit if there is an error, just need to
                # remove partial files
                remove_partial(resultsfilename + ".txt")
                remove_partial(resultsfilename + const.filename_extension)
                log.info("Could not commit to db")
                print("Could not commit to db!")
                util.fatal("Could not commit vote information to database")

            # Post-processing
            # move the images from unproc to proc
            log.debug("Renaming")
            procs = [
                filen(proc1d, next_ballot_number + m)
                + const.filename_extension
                for m in range(const.num_pages)
            ]
            for src, dst in zip(unprocs, procs):
                try:
                    os.rename(src, dst)
                except OSError:
                    log.info("Could not rename %s", src)
                    util.fatal("Could not rename %s", src)
            total_images_processed += const.num_pages

            # Tell caller you've processed all images of this ballot by
            # updating the next-ballot file with the next image number
            log.debug("Requesting next")
            util.writeto(next_to_process_file,
                         next_ballot_number + const.num_pages)
            log.debug("Done writing nexttoprocess.txt")
            log.info("%d images processed", const.num_pages)
        except FileNotPresentException:
            # a missing image is not fatal: the next prompt offers a skip
            file_problem = True
            print("FileNotPresentException")
            sys.stdout.flush()
            log.info("FileNotPresentException occurred")
def main():
    """Run the command-driven ballot-extraction loop.

    Reads the configuration file named on the command line, creates the
    working directories, loads the ballot class for const.layout_brand and
    opens the database (db.NullDB when const.use_db is false).

    Whenever no ballot is pending, get_processing_command() is asked for a
    command, interpreted by its first character:

        "+"   skip ahead by const.num_pages, then process one ballot
        "=N"  set the next image number to N, then process one ballot
        "S"   process one ballot at the current number
        "0"   (or anything else) stop

    For each ballot the const.num_pages image files are processed, inserted
    into the database and renamed into the "proc" tree, after which
    nexttoprocess.txt is advanced by const.num_pages.
    """
    # get command line arguments
    cfg_file = get_args()

    # read configuration from tevs.cfg and set constants for this run
    config.get(cfg_file)
    util.mkdirp(const.root)
    log = config.logger(const.logfilename)
    log.info("Log created.")

    # create initial toplevel directories if they don't exist
    for subdir in ("templates", "template_images", "composite_images",
                   "results", "proc", "errors"):
        util.mkdirp(util.root(subdir))

    # make sure you have code for ballot type spec'd in config file
    try:
        ballotfrom = Ballot.LoadBallotType(const.layout_brand)
    except KeyError:
        # NOTE(review): util.fatal is assumed to terminate the process.
        util.fatal("No such ballot type: " + const.layout_brand
                   + ": check " + cfg_file)

    cache = Ballot.TemplateCache(util.root("templates"))
    extensions = Ballot.Extensions(template_cache=cache)

    # connect to db and open cursor
    if const.use_db:
        try:
            dbc = db.PostgresDB(database=const.dbname, user=const.dbuser)
        except db.DatabaseError:
            util.fatal("Could not connect to database")
    else:
        dbc = db.NullDB()
    log.info("Database connected.")

    total_images_processed, total_images_left_unprocessed = 0, 0

    # Each time given a signal to proceed for count_to_process ballots,
    # create ballot from images, get landmarks, get layout code, get votes.
    # Write votes to database and results directory.
    next_to_process_file = util.root("nexttoprocess.txt")
    count_to_process = 0
    while True:
        next_ballot_number = int(util.readfrom(next_to_process_file))

        if count_to_process == 0:
            # wait here until get_processing_command returns
            # it will wait on input instruction from stdio
            processing_command = get_processing_command(next_ballot_number)
            # The prefixes are mutually exclusive, so at most one branch
            # runs; an unrecognised command leaves the count at 0.
            if processing_command.startswith("+"):
                next_ballot_number += const.num_pages
                util.writeto(next_to_process_file, next_ballot_number)
                count_to_process = 1
            elif processing_command.startswith("="):
                next_ballot_number = int(processing_command[1:])
                util.writeto(next_to_process_file, next_ballot_number)
                count_to_process = 1
            elif processing_command.startswith("S"):
                count_to_process = 1
            elif processing_command.startswith("0"):
                count_to_process = 0

            # we're done when we get instructed to process 0
            if count_to_process == 0:
                break

        count_to_process -= 1
        try:
            # clean up, in case...
            gc.collect()
            log.debug("Request for %d", next_ballot_number)
            unprocs = [
                incomingn(next_ballot_number + m)
                for m in range(const.num_pages)
            ]
            log.info(unprocs)

            # we need all images for sheet to be available to process it
            for filename in unprocs:
                if not os.path.exists(filename):
                    errmsg = "File %s not present or available!" % (
                        os.path.basename(filename), )
                    log.info(errmsg)
                    # if a file is not yet available, that's not fatal
                    raise FileNotPresentException(errmsg)

            # Processing
            log.debug("Creating ballot.")
            try:
                ballot = ballotfrom(unprocs, extensions)
                log.debug("Created ballot, processing.")
                ballot.ProcessPages()
                log.debug("Processed.")
            except BallotException as e:
                total_images_left_unprocessed += mark_error(e, *unprocs)
                log.exception("Could not process ballot")
                continue

            # Write all data
            # make dirs:
            proc1d = dirn("proc", next_ballot_number)
            resultsd = dirn("results", next_ballot_number)
            resultsfilename = filen(resultsd, next_ballot_number)
            for directory in (proc1d, resultsd):
                util.mkdirp(directory)

            # NOTE: writing VOP/CSV result files is currently disabled;
            # the ballot is only stored through dbc.insert below.

            # write to the database
            try:
                log.debug("Inserting to db")
                dbc.insert(ballot)
            except db.DatabaseError:
                # dbc does not commit if there is an error, just need to
                # remove partial files
                remove_partial(resultsfilename + ".txt")
                remove_partial(resultsfilename + const.filename_extension)
                log.info("Could not commit to db")
                # single-argument print(...) behaves the same under the
                # Python 2 print statement and the Python 3 function
                print("Could not commit to db!")
                util.fatal("Could not commit vote information to database")

            # Post-processing
            # move the images from unproc to proc
            log.debug("Renaming")
            procs = [
                filen(proc1d, next_ballot_number + m)
                + const.filename_extension
                for m in range(const.num_pages)
            ]
            for src, dst in zip(unprocs, procs):
                try:
                    os.rename(src, dst)
                except OSError:
                    log.info("Could not rename %s", src)
                    util.fatal("Could not rename %s", src)
            total_images_processed += const.num_pages

            # Tell caller you've processed all images of this ballot by
            # updating the next-ballot file with the next image number
            log.debug("Requesting next")
            util.writeto(next_to_process_file,
                         next_ballot_number + const.num_pages)
            log.debug("Done writing nexttoprocess.txt")
            log.info("%d images processed", const.num_pages)
        except FileNotPresentException as e:
            # report the missing file to the controlling process and go on
            print(e)
            sys.stdout.flush()
def __init__(self, next_file, inc):
    """Remember the counter file and increment, and load the start value.

    self.next is the integer value of whatever util.readfrom returns for
    next_file, with 1 passed as its second argument.
    """
    self.inc = inc
    self.next_file = next_file
    stored_value = util.readfrom(next_file, 1)
    self.next = int(stored_value)