def read_parameters(config_file):
    """Load a user configuration and overlay its values on the schema defaults.

    Only keys that the engine schema declares are kept; any user-specified
    value overrides the corresponding default.
    """
    user_configs = read_json(config_file)
    schema = read_json('engines/schemas/%s.schema' % user_configs["engine"])
    merged = read_parameter_from_schema(schema)
    # user values win over schema defaults, unknown user keys are ignored
    for key, value in user_configs.items():
        if key in merged:
            merged[key] = value
    return merged
def get_old_traineddata(config):
    """Return the path to the .traineddata file of the model being continued.

    The model prefix comes from the old configuration when present, otherwise
    from the common-schema default.
    """
    prev = config["continue_from"]
    old_config = read_json(os.path.join(config_root, prev["config"]))
    old_model_dir = get_model_dir(prev["trainset"], prev["config"])
    common_schema = read_json("engines/schemas/common.schema")
    if "model_prefix" in old_config:
        prefix = old_config["model_prefix"]
    else:
        prefix = common_schema["definitions"]["model_prefix"]["default"]
    return os.path.join(model_root, old_model_dir, prefix,
                        prefix + '.traineddata')
def read_layer_info(layername):
    """Return a {parameter: default} mapping for one layer schema."""
    schema = read_json("engines/schemas/models/layer_%s.schema" % layername)
    return {key: spec["default"] for key, spec in schema["definitions"].items()}
def load_jsonref(ref_path):
    """Resolve a JSON '$ref' path such as 'file.schema#/section/attr'.

    The reference is split from the right into (file part, property, attr);
    the file part's trailing character (the '#' of the JSON-pointer syntax)
    is stripped before the schema file is loaded.

    Returns the referenced sub-schema node.
    """
    ref_file, ref_propty, ref_attr = ref_path.rsplit('/', 2)
    ref_file = ref_file[:-1]  # drop trailing '#' of the JSON-pointer syntax
    schema_path = '%s/engines/schemas/%s' % (os.getcwd(), ref_file)
    # (leftover debug print of schema_path removed)
    ref_schema = read_json(schema_path)
    return ref_schema[ref_propty][ref_attr]
def validate_model(model, engine):
    """Validate a user-supplied model structure against the engine's layer schemas.

    Args:
        model: expected to be a list of one-key dicts, one per layer.
        engine: OCR engine name used to select the layer schema file.

    Returns:
        A list of human-readable error strings; empty when the model is valid.
    """
    err_str = []
    if not isinstance(model, list):
        err_str.append('parameter model, model should be a list of arrays')
        return err_str
    layers = read_json(
        'engines/schemas/models/layer_all_%s.schema' % engine)["definitions"]
    # BUG FIX: the original incremented layer_no at the end of the loop body,
    # so every `continue` skipped the increment and later error messages
    # reported wrong layer indices; enumerate keeps the counter correct.
    for layer_no, ele in enumerate(model):
        if not isinstance(ele, dict):
            err_str.append(
                'parameter model, layer %d must be a dictionary' % layer_no)
            continue
        if len(ele.keys()) != 1:
            err_str.append(
                'parameter model, layer %d must only contains one layer' %
                layer_no)
            continue
        for key in ele:
            if key not in layers:
                err_str.append(
                    'parameter model, layer %s is not defined in the model of engine %s'
                    % (key, engine))
            else:
                # resolve intra-schema $refs relative to the schemas directory
                resolver = RefResolver(
                    'file://%s/engines/schemas/models/' % os.getcwd(), None)
                validator = Draft4Validator(layers[key], resolver=resolver)
                for error in validator.iter_errors(ele):
                    err_str.append('parameter model, layer %s, %s' %
                                   (key, error.message))
    return err_str
def delete():
    """Delete a trained-model file from disk, then go back to the model list.

    Tesseract keeps final models at the top level and checkpoints in a
    subfolder; calamari stores several files per model prefix, all of which
    are removed together.
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    engine = read_json(os.path.join(config_root, config))["engine"]
    base_dir = os.path.join(os.getcwd(), model_root, model_dir)
    if engine == 'tesseract':
        if file_name.endswith('.traineddata'):
            os.remove(os.path.join(base_dir, file_name))
        else:
            os.remove(os.path.join(base_dir, "checkpoint", file_name))
    elif engine == 'calamari':
        # remove every file that shares the model prefix
        for entry in os.listdir(base_dir):
            if entry.startswith(file_name):
                os.remove(os.path.join(model_root, model_dir, entry))
    else:
        os.remove(os.path.join(base_dir, file_name))
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
def download():
    """Send a trained-model file to the client.

    Calamari models consist of several files sharing a prefix; those are
    bundled into a .tar.gz before sending (except the 'report' file, which is
    sent as-is).
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    # (leftover debug print of the engine removed)
    base_dir = os.path.join(os.getcwd(), model_root, model_dir)
    if config_content["engine"] == 'tesseract':
        # Consistent with delete(): only real .traineddata files live at the
        # top level, everything else is a checkpoint.
        if file_name.endswith('.traineddata'):
            out_file = os.path.join(base_dir, file_name)
        else:
            out_file = os.path.join(base_dir, "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(base_dir, file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)  # rebuild a stale archive
            files = [
                os.path.join(model_root, model_dir, ele)
                for ele in os.listdir(os.path.join(model_root, model_dir))
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(base_dir, file_name)
    else:
        out_file = os.path.join(base_dir, file_name)
    return send_file(out_file, attachment_filename=file_name,
                     as_attachment=True)
def read_model_param(engine):
    """Collect help rows for every layer and layer parameter of an engine.

    Each row is [name, subname, type, default, description]; a layer row is
    followed by one row per parameter of that layer.
    """
    engine_schema = read_json('engines/schemas/models/model_%s.schema' % engine)
    layer_refs = engine_schema["definitions"]["model"]["items"]["oneOf"]
    help_info = []
    for layer in layer_refs:
        layer_def = load_jsonref('models/' + layer["$ref"])["properties"]
        for key, cur_layer in layer_def.items():
            help_info.append(
                [key, '', "dictionary", '', cur_layer["description"]])
            for para in cur_layer["properties"]:
                para_def = load_jsonref(
                    "models/" + cur_layer["properties"][para]["$ref"])
                if "enum" in para_def:
                    description = ('Allowed values: ' +
                                   ', '.join(map(str, para_def["enum"])) +
                                   '. ' + para_def["description"])
                else:
                    description = para_def["description"]
                help_info.append([
                    '', para, para_def["type"], para_def["default"],
                    description
                ])
    return help_info
def upload():
    """Publish a trained-model file to the remote repository as a draft.

    The local file is located the same way download() locates it (calamari
    file groups are bundled into a .tar.gz first), then handed to
    publish_model().
    """
    trainset = request.args.get('trainname', None)
    config = request.args.get('config', None)
    model_dir = get_model_dir(trainset, config)
    file_name = request.args.get("filename", None)
    config_content = read_json(os.path.join(config_root, config))
    print(config_content["engine"])
    if config_content["engine"] == 'tesseract':
        if '.traineddata' in file_name:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    "checkpoint", file_name)
    elif config_content["engine"] == "calamari":
        if file_name != 'report':
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name + '.tar.gz')
            if os.path.exists(out_file):
                os.remove(out_file)
            files = os.listdir(os.path.join(model_root, model_dir))
            files = [
                os.path.join(model_root, model_dir, ele) for ele in files
                if ele.startswith(file_name)
            ]
            compress_file(files, out_file)
            file_name += '.tar.gz'
        else:
            out_file = os.path.join(os.getcwd(), model_root, model_dir,
                                    file_name)
    else:
        out_file = os.path.join(os.getcwd(), model_root, model_dir, file_name)
    print('uploading...')
    publish_model(
        access_token=app.token,
        model_file=out_file,  # local path
        # BUG FIX: the remote name was hard-coded to 'my_image.jpg';
        # use the actual model file name (no path component) instead.
        remote_file=file_name,
        ocr_engine=config_content["engine"],  # engine that can run the model
        license_name='WTFPL',  # Zenodo recognizes acronyms such as this one
        # TODO(review): placeholder metadata — should describe the training
        # data, parameters and result accuracy; uploaded as metadata.json.
        metadata={
            'info': 'this map can contain anything; if you do not want it, set it to none',
            'content': 'ideally it should contain all information about the training data, the parameters, the result accuracy, ...',
            'usage': 'this gets uploaded as metadata.json along with the model'
        },
        # pairs (link, doi) of related DOIs, or None
        related_DOI=[('cites', '123')],
        # True keeps the upload as a draft; no publish request is sent
        is_draft=True)
    print('uploaded!')
    return redirect(
        url_for('manage_model_list', trainset=trainset, config=config))
def read_parameter_default(engine):
    """Gather default parameter values from the common and engine schemas.

    Properties of object type in the engine schema are flattened into
    '<parent>_<child>' keys; properties carrying a '$ref' are skipped.
    """
    defaults = {}
    common_schema = read_json("engines/schemas/common.schema")
    for key, definition in common_schema["definitions"].items():
        if definition["type"] != "object":
            defaults[key] = definition["default"]
    engine_schema = read_json("engines/schemas/engine_%s.schema" % engine)
    for key, prop in engine_schema["properties"].items():
        if "$ref" in prop:
            continue  # referenced definitions carry no inline default
        if prop["type"] == "object":
            for ele, sub in prop["properties"].items():
                defaults['%s_%s' % (key, ele)] = sub["default"]
        else:
            defaults[key] = prop["default"]
    return defaults
def read_help_information(engine):
    """Render a plain-text help message listing every parameter of an engine."""
    schema = read_json('schemas/%s.schema' % engine)
    attrs = schema['properties']
    lines = ['OCR Engine: %s\n' % engine, 'Parameters: \n']
    for k in attrs:
        lines.append('\t --%s\tdefault:[%s]\t%s\n' %
                     (k, str(attrs[k]["default"]), attrs[k]["description"]))
    return ''.join(lines)
def __init__(self, file_config, model_dir):
    """Load a user configuration and prepare the training command list.

    NOTE(review): this method belongs to a class whose header is outside this
    view; attribute semantics below are inferred from their use here.

    Args:
        file_config: configuration file name, resolved under config_root.
        model_dir: directory where the trained model is stored.
    """
    self.configs = read_json(pjoin(config_root, file_config))
    self.engine = self.configs["engine"]
    self.model_dir = model_dir
    # per-engine mapping from schema parameter names to CLI arguments
    self.translator = read_json('engines/schemas/translate.json')[
        self.engine]
    # load default values
    # self.default = read_parameter_default(self.engine)
    # replace default values with user specified values
    self.values = read_value(self.configs, self.engine)
    if "continue_from" in self.configs:
        print('transalte path')
        self.values["continue_from"] = translate_continue_path(
            self.configs["engine"], self.configs["continue_from"])
    else:
        self.values["continue_from"] = ''
    if "model" in self.configs:
        self.model_translator = ModelTranslator(self.configs["model"],
                                                self.engine)
    elif "continue_from" not in self.configs:
        # no explicit model and nothing to continue from:
        # fall back to the engine's default model structure
        self.configs["model"] = read_json(
            'engines/schemas/models/default_model_%s.schema' % self.engine)
        self.model_translator = ModelTranslator(self.configs["model"],
                                                self.engine)
    self.model_prefix = self.values["model_prefix"]
    self.nepoch = self.values['nepoch']
    partition = self.values['partition']
    # split the data into training and test sets per the partition ratio
    self.ntrain, self.ntest = split_train_test(data_folder,
                                               tmp_folder,
                                               partition,
                                               engine=self.engine)
    self.translate()
    # non-tesseract engines run inside their own conda environment,
    # so wrap the command list in activate/deactivate calls
    self.cmd_list = [act_environ(self.engine)] + self.cmd_list + [deact_environ]\
        if self.engine != 'tesseract' else self.cmd_list
def read_model_info(engine):
    """Build help rows for the 'model' parameter and each available layer type.

    Each row is [name, subname, type, default, description].
    """
    engine_schema = read_json('engines/schemas/models/model_%s.schema' % engine)
    model = engine_schema["definitions"]["model"]
    help_info = [["model", '', model["type"], '', model["description"]]]
    for layer in model["items"]["oneOf"]:
        layer_def = load_jsonref('models/' + layer["$ref"])["properties"]
        for key, cur_layer in layer_def.items():
            help_info.append(
                ['', key, "dictionary", '', cur_layer["description"]])
    return help_info
def validate_string(config_str): config, err = read_config_str(config_str) # read configuration file if len(err) > 0: errors = ['Configuration file is not valid dictionary.'] for e in err: errors.append('\t' + e) return errors common_schema = read_json( 'engines/schemas/common.schema') # valid against common schema errors = validate(common_schema, config) if len(errors) > 0: return errors errors = [] engine = config["engine"] engine_schema = read_json('engines/schemas/engine_%s.schema' % engine) errors_model = [] if "model" in config: # valid against model schema model = config["model"] errors_model = validate_model(model, engine) errors += errors_model if "continue_from" in config: errors += validate_continue_from( engine_schema["properties"]["continue_from"], config) del config["continue_from"] elif "append" in config: errors += [ 'parameter append, please specify the model structure to append.' ] # errors += errors_continue if "model" in config: del config["model"] errors += validate(engine_schema, config) # valid against engine schema nerr = len(errors) for i in range(nerr): errors[i] = 'Error %d: %s' % (i, errors[i]) return errors
def read_help_information_html(engine):
    """Build help rows [name, subname, type, default, description] for the
    HTML parameter table of the given engine."""
    schema = read_json('engines/schemas/engine_%s.schema' % engine)
    attrs = schema['properties']
    help_info = []
    for k in attrs:
        print(k, attrs[k])
        if k == "model":
            # the model parameter expands into one row per layer type
            help_info += read_model_info(engine)
            continue
        elif "$ref" in attrs[k]:
            ref_path = attrs[k]["$ref"]
            cur_node = load_jsonref(ref_path)
            # NOTE: when the referenced node is NOT an object we deliberately
            # fall through to the scalar handling below, with cur_node already
            # holding the resolved reference.
            if cur_node["type"] == "object":
                help_info += read_object(k, cur_node)
                continue
        elif attrs[k]["type"] == "object":
            cur_node = attrs[k]
            help_info += read_object(k, cur_node)
            continue
        else:
            cur_node = attrs[k]
        # scalar parameter: numbers show their format, enums list their values
        if cur_node["type"] == "number":
            help_info.append([
                k, '', cur_node["format"],
                str(cur_node["default"]), cur_node["description"]
            ])
        else:
            if "enum" in cur_node:
                help_info.append([
                    k, '', cur_node["type"],
                    str(cur_node["default"]),
                    "Allowed Value: " + ', '.join(cur_node["enum"]) + '. ' +
                    cur_node["description"]
                ])
            else:
                help_info.append([
                    k, '', cur_node["type"],
                    str(cur_node["default"]), cur_node["description"]
                ])
    return help_info
def valid_from_file(file_train, file_config):
    """Evaluate every stored model checkpoint on the validation set.

    Writes a per-iteration 'report' file under the model directory and copies
    the best-performing model aside.  Checkpoints already present in the
    existing report are not re-evaluated.

    Returns 'No model yet.' when no checkpoint exists; otherwise None.
    """
    configs = read_json(pjoin('static/configs', file_config))
    model_dir = get_model_dir(file_train, file_config)
    engine = configs["engine"]
    common_schema = read_json("engines/schemas/common.schema")
    model_prefix = configs["model_prefix"] if "model_prefix" in configs \
        else common_schema["definitions"]["model_prefix"]["default"]
    # noinspection PyInterpreter
    dict_res, best_perform, best_model = read_report(model_dir)
    # write the evaluation report
    # NOTE(review): f_out is never closed explicitly — consider a 'with' block
    f_out = open(pjoin(model_root, model_dir, 'report'), 'w')
    model_postfix = get_model_postfixes(engine, model_dir, model_prefix)
    if len(model_postfix) == 0:
        return 'No model yet.'
    for index in model_postfix:
        if int(index) not in dict_res:  # checkpoint not evaluated yet
            model_file = get_model_path(engine, model_prefix, index)
            model_path = pjoin(model_root, model_dir, model_file)
            if engine == 'tesseract':
                # freeze the checkpoint into a .traineddata, then OCR every
                # validation image with it
                # NOTE(review): hard-coded absolute path to lstmtraining —
                # should come from configuration
                cmd_list = [
                    'export TESSDATA_PREFIX=%s' %
                    pjoin(os.getcwd(), model_root, model_dir),
                    '/Users/doreen/Documents/Experiment/Package/tesseract/src/training/lstmtraining --stop_training --continue_from %s --traineddata %s --model_output %s'
                    % (model_path,
                       pjoin(model_root, model_dir, model_prefix,
                             '%s.traineddata' % model_prefix),
                       pjoin(model_root, model_dir,
                             model_prefix + '.traineddata'))
                ]
                convert_image(valid_folder)
                image_files = get_all_files(data_folder=valid_folder,
                                            postfix='.tif')
                for imf in image_files:
                    cmd_list.append(
                        'tesseract -l %s %s/%s.tif %s/%s ' %
                        (model_prefix, valid_folder, imf, valid_folder, imf))
            else:
                # other engines run inside their conda environment
                cmd_list = [act_environ(engine)]
                cmd_list.append(get_cmd(engine, model_path))
                cmd_list.append('conda deactivate')
            cmd = '\n'.join(cmd_list)
            subprocess.run(cmd, shell=True)
            gt_files = [
                valid_folder + '/' + ele for ele in os.listdir(valid_folder)
                if ele.endswith('.gt.txt')
            ]
            if engine == 'calamari':
                res_str = evaluate(gt_files,
                                   flag_confusion=0,
                                   extension='.pred.txt')
            else:
                res_str = evaluate(gt_files, flag_confusion=0)
            # NOTE(review): this keeps the model with the HIGHEST char error
            # rate as 'best' — verify intent; lower CER is usually better
            if float(res_str["char_error_rate"]) > best_perform:
                best_perform = float(res_str["char_error_rate"])
                best_model = index
            f_out.write(
                'Iteration: %s, character errors: %d, total characters: %d, char error rate: %s, word errors: %d, total words: %d, word error rate: %s\n'
                % (index, res_str["char_errs"], res_str["char_total"],
                   res_str["char_error_rate"], res_str["word_errs"],
                   res_str["word_total"], res_str["word_error_rate"]))
        else:
            # already evaluated: copy the stored result into the new report
            res_str = dict_res[int(index)]
            f_out.write('Iteration: %s, %s\n' % (res_str[1], res_str[0]))
    copy_best_model(engine, model_dir, model_prefix, best_model)


# from_file('train_500.tar.gz', 'sample_calamari.json')
# eval_from_file(model_dir='tess_new', engine='tesseract', model_prefix='tess')
def process_kraken_reshape_size(config):
    """Return the input size at the layer where a kraken model is appended."""
    old_config_path = os.path.join(config_root,
                                   config["continue_from"]["config"])
    old_model = read_json(old_config_path)["model"]
    return get_old_input_size(old_model, config["append"])
def validate_continue_from(continue_from_schema, new_config):
    """Validate the "continue_from" section of a new training configuration.

    Checks, in order: schema validity of the section itself, existence of the
    old configuration, engine match between old and new configs, existence of
    the old model file, and consistency of any "model"/"append" structure.

    Returns a list of human-readable error strings (empty when valid).
    """
    err_str = []
    engine = new_config["engine"]
    # check whether continue from parameter is valid against the engine schema
    resolver = RefResolver('file://%s/engines/schemas/' % os.getcwd(), None)
    validator = Draft4Validator(continue_from_schema, resolver=resolver)
    continue_from = new_config["continue_from"]
    for error in validator.iter_errors(continue_from):
        err_str.append('parameter continue_from, %s' % error.message)
    if len(err_str) > 0:
        return err_str
    # check whether engine matches
    if not os.path.exists(os.path.join(config_root, continue_from["config"])):
        err_str.append(
            'parameter continue_from, configuration for the old model does not exists'
        )
        return err_str
    old_config = read_json(os.path.join(config_root, continue_from["config"]))
    if old_config["engine"] != new_config["engine"]:
        err_str.append(
            'parameter engine, engines for old model and new model not match')
        return err_str
    # check whether the path to continue from exists
    model_dir = get_model_dir(continue_from["trainset"],
                              continue_from["config"])
    if engine == 'tesseract':
        # tesseract checkpoints live in a dedicated subfolder
        model_path = os.path.join(model_root, model_dir, 'checkpoint',
                                  continue_from["model"])
    else:
        model_path = os.path.join(model_root, model_dir,
                                  continue_from["model"])
    if engine == 'calamari':
        model_path += '.json'
    if not os.path.exists(model_path):
        err_str.append('parameter continue_from, model does not exist')
    # check whether the model structure is right
    if "model" in new_config:
        if engine == 'calamari':
            if new_config['model'] != old_config["model"]:
                err_str.append(
                    'parameter model, old model and new model must match for calamari.'
                )
                return err_str
        elif engine == 'ocropus':
            err_str.append(
                'parameters model, ocropus does not support new model structure for fine tuning.'
            )
            return err_str
        if "append" in new_config:
            append_index = new_config["append"]
            # NOTE(review): the '"append" not in new_config' clause below is
            # always False inside this branch — only 'append_index < 1' can
            # trigger the error
            if "append" not in new_config or append_index < 1:
                err_str.append(
                    'parameter append, please assign a valid append')
            new_model = new_config["model"]
            old_model = old_config["model"]
            # count an implicit input layer when the first layer is not one,
            # and exclude an explicit output layer from the count
            len_old_model = len(
                old_model) if "input" in old_model[0] else len(old_model) + 1
            len_old_model = len_old_model - 1 if "output" in old_model[
                -1] else len_old_model
            if append_index >= len_old_model:
                err_str.append(
                    'parameter append, append_index must be less than the number of layers (excluding output layer, including input layer).'
                )
            # NOTE(review): concat_model is computed but never used or
            # returned — possibly leftover from an earlier implementation
            concat_model = old_model[:append_index] if "input" in old_model[
                0] else old_model[:append_index - 1]
            concat_model += new_model
    else:
        if "append" in new_config:
            err_str.append(
                'parameter append, please specify the model structure to append.'
            )
        if engine == 'calamari':
            if "model" in old_config:
                err_str.append(
                    'parameter model, old model and new model must match for calamari'
                )
    return err_str
def __init__(self, model, engine):
    """Store the model structure, the engine name and the layer-translation map."""
    self.translator = read_json(
        "engines/schemas/models/translate_model.json")
    self.engine = engine
    self.model = model