def run_metalex_test(): #------------------ Argparse commands configuration ------------------------- metalexArgsParser = argparse.ArgumentParser( prog='metalex', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( colored(''' --------------------------------------------------------------- | * * * * * * * * * * * * * * * * ** ** | | * * * * * * * * * * * * * * | | * * * * * * * * * * * * * * ** ** | --------------------------------------------------------------- metalex is general tool for lexicographics and metalexicographics activities ''', 'blue', attrs=['blink', 'bold'])), epilog=textwrap.dedent(''' ------------------------------------------------------------------------------ metalex project : special Thank to Bill for metalex-vagrant version ------------------------------------------------------------------------------ '''), prefix_chars='-') metalexArgsParser.add_argument('-v', '--version', action='version', version='%(prog)s v0.2') metalexArgsParser.add_argument('-p', '--project', help='Defined %(prog)s project name', dest='projectName', action='store') metalexArgsParser.add_argument( '-c', '--confproject', action='store', help='Defined %(prog)s configuration for the current project', dest='confProject', nargs=3, metavar=('author', 'comment', 'contributors')) metalexArgsParser.add_argument( '-i', '--dicimage', dest='imageFile', action='append', nargs='?', help= 'Input one or multiple dictionary image(s) file(s) for current %(prog)s project' ) metalexArgsParser.add_argument( '-d', '--imagedir', help= 'Input folder name of dictionary image files for current %(prog)s project', type=str, required=True, action='store', dest='imagesDir') metalexArgsParser.add_argument( '--imgalg', help= 'Set algorithm for enhancing dictionary image files for current %(prog)s project (actiontype must be : constrat or bright or filter)', type=str, action='store', nargs=2, dest='imgalg', metavar=('actiontype', 'value')) metalexArgsParser.add_argument( '-r', '--filerule', dest='fileRule', type=str, help='Defined file rules that we use to enhance quality of OCR result') metalexArgsParser.add_argument( '-l', '--lang', help= 'Set language for optical characters recognition and others %(prog)s treatment', type=str) metalexArgsParser.add_argument( '-x', '--xml', help='Defined output result treatment of %(prog)s', type=str, nargs=3, choices=('xml', 'lmf', 'tei')) metalexArgsParser.add_argument( '-s', '--save', help='Save output result of the current project in files', action='store_true') metalexArgsParser.add_argument( '-t', '--terminal', help='Show result of the current treatment in the terminal', action='store_true', default=False) # ----Build contains args------------------------------------------------ metalexArgs = metalexArgsParser.parse_args() # ----Generate real path of images--------------------------------------- imagelist = [] if metalexArgs.imageFile: imagelist.append(metalexArgs.imageFile) elif metalexArgs.imagesDir: content = './' + metalexArgs.imagesDir + '/*.*g' for imagefile in glob.glob(content): name = os.getcwd() + '/' + imagefile imagelist.append(name) if len(imagelist) < 1: message = u"Your current directory don't have image(s)" dico.logs.manageLog.write_log(message, typ='warm') else: message = u"You must define folder containing image of dictionary or image of dictionary for your project otherwise default folder must be use" dico.logs.manageLog.write_log(message, typ='warm') for imagefile in glob.glob('imagesInput/*.*g'): name = os.getcwd() + '/' + imagefile imagelist.append(name) if len(imagelist) < 1: message = u"Your current directory don't have image(s)" dico.logs.manageLog.write_log(message, typ='warm') # ----Defined New project name------------------------------------------- if metalexArgs.projectName: project = dico.NewProject(metalexArgs.projectName) else: message = u"Your current project name is not set! Please correct it otherwise default name must be use" project = dico.NewProject(u'metalex_projectName') dico.logs.manageLog.write_log(message, typ='warm') # ----Set metadata for the current project------------------------------- if metalexArgs.confProject: author, comment, contrib = metalexArgs.confProject[ 0], metalexArgs.confProject[1], metalexArgs.confProject[2] project.set_conf_project(author, comment, contrib) else: message = u'Please set metadata for the current project. default metadata data must be apply' dico.logs.manageLog.write_log(message, typ='error') project.set_conf_project(u'metalex_user', u'Comment_user', u'metalex_contributors') # ----Input dictionary images to project--------------------------------- images = project.metalex.get_images(imagelist) # ----Enhance quality of dictionary image files ------------------------- if metalexArgs.imgalg: actionType, value = metalexArgs.imgalg if actionType == 'constrat': images.EnhanceImages().constrast(value) elif actionType == 'bright': images.EnhanceImages().bright(value) elif actionType == 'filter': images.EnhanceImages().filter(f.DETAIL) else: message = u"Your input string 'actiontype' don't match (constrat or bright or filter)" dico.logs.manageLog.write_log(message, typ='warm') else: images.EnhanceImages().filter(f.DETAIL) # ----Start optical recognition of dictionary image files---------------- if metalexArgs.save and metalexArgs.lang: execOcr = images.BuildOcrImages(save=True, langIn=metalexArgs.lang) execOcr.image_to_text() elif metalexArgs.lang: execOcr = images.BuildOcrImages(save=False, langIn=metalexArgs.lang) execOcr.image_to_text() elif metalexArgs.terminal and metalexArgs.lang: execOcr = images.BuildOcrImages(show=True, langIn=metalexArgs.lang) execOcr.image_to_text() else: execOcr = images.BuildOcrImages(save=True, langIn='fra') execOcr.image_to_text() # ----Normalize result of ocr files ------------------------------------ if metalexArgs.fileRule: execNormalize = images.BuildTextWell(metalexArgs.fileRule) execNormalize.make_text_well() else: message = u"FileRule() >> You don't defined file rules for this project. *file_Rule.dic* will be used instead" execNormalize = images.BuildTextWell(u'../../file_Rule.dic') dico.logs.manageLog.write_log(message, typ='warm') execNormalize.make_text_well() #-----Produce HTML output file for project------------------------------ if metalexArgs.save: images.dico_html(save=metalexArgs.save) baliseXML = images.BaliseXML() if metalexArgs.xml: baliseXML.put_xml(save=metalexArgs.save, typ=metalexArgs.xml) else: images.dico_html(save=False)
def run_metalex_test(self): #------------------ Argparse commands configuration ------------------------- metalexArgsParser = argparse.ArgumentParser( prog='metalex', formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent( colored(''' --------------------------------------------------------------- | * * * * * * * * * * * * * * * * ** ** | | * * * * * * * * * * * * * * | | * * * * * * * * * * * * * * ** ** | --------------------------------------------------------------- metalex is general tool for lexicographics and metalexicographics activities ''', 'blue', attrs=['blink', 'bold'])), epilog=textwrap.dedent(''' ------------------------------------------------------------------------------ metalex project : special Thank to Bill for metalex-vagrant version ------------------------------------------------------------------------------ '''), prefix_chars='-') metalexArgsParser.add_argument('-v', '--version', action='version', version='%(prog)s v0.2') metalexArgsParser.add_argument('-p', '--project', help='Defined %(prog)s project name', dest='projectName', action='store') metalexArgsParser.add_argument( '-c', '--confproject', action='store', help='Defined %(prog)s configuration for the current project', dest='confProject', nargs=3, metavar=('author', 'comment', 'contributors')) metalexArgsParser.add_argument( '-i', '--dicimage', dest='imageFile', action='append', nargs='?', help= 'Input one or multiple dictionary image(s) file(s) for current %(prog)s project' ) metalexArgsParser.add_argument( '--dld', dest='download', help= 'Download ocropy model from Github for current %(prog)s project') metalexArgsParser.add_argument( '-o', '--ocrtype', dest='ocrType', choices=('ocropy', 'tesserocr'), help='OCR type to use for current %(prog)s project', type=str, default="tesserocr") metalexArgsParser.add_argument( '-m', '--model', dest='modelRef', choices=('modeldef', ''), help='OCR LSTM model to use for current %(prog)s project', type=str) metalexArgsParser.add_argument( '-d', '--imagedir', action='store', help= 'Input folder name of dictionary image files for current %(prog)s project', type=str, dest='imagesDir') metalexArgsParser.add_argument('--imgalg', type=str, action='store', nargs=2, dest='imgalg', help='Set algorithm for enhancing dictionary image files for current'+\ ' %(prog)s project (actiontype must be : contrast or bright or filter)', metavar=('actiontype', 'value'), choices=('contrast', 'bright', 'filter')) metalexArgsParser.add_argument( '-r', '--filerule', dest='fileRule', type=str, help= 'Defined file rules that we use to enhance quality of OCR result') metalexArgsParser.add_argument( '-l', '--lang', type=str, help= 'Set language for optical characters recognition and others %(prog)s treatment' ) metalexArgsParser.add_argument( '-x', '--xml', help='Defined output result treatment of %(prog)s', type=str, nargs=1, choices=('xml', 'lmf', 'tei')) metalexArgsParser.add_argument( '-s', '--save', help='Save output result of the current project in files', action='store_true') metalexArgsParser.add_argument( '-t', '--terminal', help='Show result of the current treatment in the terminal', action='store_true', default=False) # ----Build contains args------------------------------------------------ metalexArgs = metalexArgsParser.parse_args() # ----- Download ocropy model file ----------------------------------- # -- https://stackoverflow.com/questions/3249524/print-in-one-line-dynamically sysPl = platform.system() home = '' if sysPl == 'Linux': home = os.environ.get('HOME') elif sysPl == 'Windows': home = os.environ.get('HOMEDRIVE') + os.environ.get('HOMEPATH') modelOcropy = home + '/metalex/models/' if not os.path.exists(modelOcropy): os.makedirs(modelOcropy) if metalexArgs.download: if metalexArgs.download == 'modeldef': url = "https://github.com/Levis0045/MetaLex/raw/master/"+\ "metalex/plugins/ocropy/models/en-default.pyrnn.gz" file_name, u = url.split('/')[-1], urllib2.urlopen(url) save = modelOcropy + file_name f = open(save, 'wb') meta = u.info() file_size = int(meta.getheaders("Content-Length")[0]) message = "Downloading ocropy model: %s | Bytes: %s" % ( file_name, file_size) print('\n' + message + '\n') file_size_dl = 0 block_sz = 8192 while True: buffer = u.read(block_sz) if not buffer: break file_size_dl += len(buffer) f.write(buffer) status = r"%10d [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size) status = "\tProcessing:" + status + chr(8) * (len(status) + 1) Printer(status) f.close() resp = '\n\nDownload complete [Save at $home/metalex/models/%s] \n' % file_name return resp # ----Generate real path of images--------------------------------------- imagelist = [] if metalexArgs.imageFile: imagelist = [os.path.abspath(x) for x in metalexArgs.imageFile] elif metalexArgs.imagesDir: content = metalexArgs.imagesDir + '/*.*' for imagefile in glob.glob(content): name = os.path.abspath(imagefile) imagelist.append(name) if len(imagelist) < 1: message = u"Your current directory don't have image(s)" metalex.logs.manageLog.write_log(message, typ='warm') else: message = u"You must define folder containing image of dictionary or image"+\ u" of dictionary for your project otherwise default folder must be use" metalex.logs.manageLog.write_log(message, typ='warm') for imagefile in glob.glob('test-files/images/*.*'): name = os.path.abspath(imagefile) imagelist.append(name) if len(imagelist) < 1: message = u"Your current directory don't have image(s)" metalex.logs.manageLog.write_log(message, typ='warm') # ----Defined New project name------------------------------------------- if metalexArgs.projectName: project = metalex.NewProject(metalexArgs.projectName) else: message = u"Your current project name is not set! "+\ u"Please correct it otherwise default name must be use" project = metalex.NewProject(u'metalex_projectName') metalex.logs.manageLog.write_log(message, typ='warm') # ----Set metadata for the current project------------------------------- if metalexArgs.confProject: author = metalexArgs.confProject[0] comment = metalexArgs.confProject[1] contrib = metalexArgs.confProject[2] project.set_conf_project(author, comment, contrib) else: message = u'Please set metadata for the current project. Default metadata data must be apply' metalex.logs.manageLog.write_log(message, typ='error') project.set_conf_project(u'metalex_user', u'Comment_user', u'metalex_contributors') # ----Input dictionary images to project--------------------------------- images = project.metalex.get_images(imagelist) # ----Enhance quality and Start optical recognition of dictionary image files---------------- model = '' if metalexArgs.modelRef != 'modeldef': model = metalexArgs.modelRef elif metalexArgs.modelRef == 'modeldef': modelpath = modelOcropy + '/en-default.pyrnn.gz' if not os.path.exists(modelpath): message = "Ocropy model not found : download it first with --dld" sys.exit(metalex.logs.manageLog.write_log(message, typ='warm')) model = modelpath if metalexArgs.ocrType == 'tesserocr' and metalexArgs.save: execOcr = images.run_img_to_text(typ=metalexArgs.ocrType, save=True, langIn=metalexArgs.lang) if metalexArgs.imgalg: actionType, value = metalexArgs.imgalg if actionType == 'contrast': execOcr.enhance_img_quality(typ='contrast', value=value) elif actionType == 'bright': execOcr.enhance_img_quality(typ='bright', value=value) elif actionType == 'filter': execOcr.enhance_img_quality(typ='filter') else: message = u"Your input string 'actiontype' don't match"+\ u"(contrast or bright or filter)" metalex.logs.manageLog.write_log(message, typ='warm') else: execOcr.enhance_img(typ='filter') execOcr.run_ocr() elif metalexArgs.ocrType == 'ocropy' and metalexArgs.save: execOcr = images.run_img_to_text(typ=metalexArgs.ocrType, save=True, langIn='fra') execOcr.run_ocr(model) elif metalexArgs.terminal and metalexArgs.lang: execOcr = images.run_img_to_text(typ=metalexArgs.ocrType, save=False, langIn=metalexArgs.lang) execOcr.run_ocr(model) # ----Normalize result of ocr files ------------------------------------ if metalexArgs.fileRule: execNormalize = images.BuildTextWell(metalexArgs.fileRule) execNormalize.make_text_well() else: message = u"FileRule() >> You don't defined file rules for this project."+\ u" *file_Rule.dic* will be used instead" execNormalize = images.BuildTextWell(u'test-files/file_Rule.dic') metalex.logs.manageLog.write_log(message, typ='warm') execNormalize.make_text_well() #-----Produce HTML output file for project------------------------------ if metalexArgs.save: images.dico_html(save=metalexArgs.save) baliseXML = images.BaliseXML() if metalexArgs.xml: baliseXML.put_xml(save=metalexArgs.save, typ=metalexArgs.xml) else: images.dico_html(save=False)