def main(data, vocab, training, model, output):
    """Train an Img2Seq model end to end.

    Args:
        data, vocab, training, model: paths to the JSON config files.
        output: directory where the merged config, weights and logs go.
    """
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Training and validation generators share all preprocessing settings.
    train_set = DataGenerator(
        path_formulas=config.path_formulas_train,
        dir_images=config.dir_images_train,
        img_prepro=greyscale,
        max_iter=config.max_iter,
        bucket=config.bucket_train,
        path_matching=config.path_matching_train,
        max_len=config.max_length_formula,
        form_prepro=vocab.form_prepro,
    )
    val_set = DataGenerator(
        path_formulas=config.path_formulas_val,
        dir_images=config.dir_images_val,
        img_prepro=greyscale,
        max_iter=config.max_iter,
        bucket=config.bucket_val,
        path_matching=config.path_matching_val,
        max_len=config.max_length_formula,
        form_prepro=vocab.form_pro if False else vocab.form_prepro,
    )

    # Warm-up / decay milestones from the config are given in epochs;
    # convert them to batch counts (ceiling division for the last batch).
    batches_per_epoch = (len(train_set) + config.batch_size - 1) // config.batch_size
    lr_schedule = LRSchedule(
        lr_init=config.lr_init,
        start_decay=config.start_decay * batches_per_epoch,
        end_decay=config.end_decay * batches_per_epoch,
        end_warm=config.end_warm * batches_per_epoch,
        lr_warm=config.lr_warm,
        lr_min=config.lr_min,
    )

    # Build the model graph and run the training loop.
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    model.train(config, train_set, val_set, lr_schedule)
def main(data, vocab, training, model, output):
    """Export the training set as numpy arrays (images + padded formulas).

    Iterates over the training DataGenerator in minibatches, pads each
    formula batch to a rectangular array, and saves the collected batches
    to ``np_img.npy`` and ``np_formula.npy``.

    Args:
        data, vocab, training, model: paths to the JSON config files.
        output: directory where the merged config is saved.

    Returns:
        None
    """
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    train_set = DataGenerator(
        path_formulas=config.path_formulas_train,
        dir_images=config.dir_images_train,
        img_prepro=greyscale,
        max_iter=config.max_iter,
        bucket=config.bucket_train,
        path_matching=config.path_matching_train,
        max_len=config.max_length_formula,
        form_prepro=vocab.form_prepro,
    )

    all_img = []
    all_formula = []
    # BUG FIX: the original referenced an undefined name `batch_size`
    # (NameError at runtime); the batch size comes from the training config.
    for _img, _formula in minibatches(train_set, config.batch_size):
        all_img.append(_img)
        if _formula is not None:
            # Pad each formula batch to equal length; the lengths array
            # returned alongside is not needed for the export.
            _formula, _ = pad_batch_formulas(_formula, vocab.id_pad, vocab.id_end)
            all_formula.append(_formula)

    np.save('np_formula', np.array(all_formula))
    np.save('np_img', np.array(all_img))
    print("DONE EXPORTING NUMPY FILES")
    # NOTE(review): the original continued with val-set construction, an
    # LRSchedule and model training AFTER this return — all unreachable
    # dead code, removed here. See the training entry point for that logic.
    return None
def main(results):
    """Evaluate a restored Img2SeqCtc model on the test set (text metrics).

    Args:
        results: output directory of a previous run, containing the saved
            configs (``data.json``, ``vocab.json``, ``model.json``) and
            the model weights.
    """
    dir_output = results

    # Restore configs, vocabulary and model weights from the run directory.
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqCtcModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights")

    # Test dataset, preprocessed the same way as during training.
    test_set = DataGenerator(
        path_formulas=config_data.path_formulas_test,
        dir_images=config_data.dir_images_test,
        img_prepro=greyscale,
        max_iter=config_data.max_iter,
        bucket=config_data.bucket_test,
        path_matching=config_data.path_matching_test,
        max_len=config_data.max_length_formula,
        form_prepro=vocab.form_prepro,
    )

    # Write reference/hypothesis formula files and get the perplexity.
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref = files[0]
    formula_hyp = files[1]

    # Score the written files and log one line with every metric.
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity
    msg = " - ".join("{} {:04.2f}".format(k, v) for k, v in scores.items())
    model.logger.info("- Test Txt: {}".format(msg))
def main(data, vocab):
    """Build the token vocabulary from the training formulas.

    Args:
        data: path to the data config JSON.
        vocab: path to the vocab config JSON (``min_count_tok``,
            ``path_vocab``).
    """
    data_config = Config(data)

    # Only the training split is needed to build the vocabulary.
    # (Removed a large slab of commented-out dead code that constructed
    # and built the test/val sets — see the dataset-build entry point.)
    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train,
    )

    # Count tokens over the training formulas and write the vocab file.
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
def main(results):
    """Evaluate a restored Img2Seq model at the image level.

    Re-renders images from the reference and predicted formulas, then
    scores the two image directories against each other.

    Args:
        results: output directory of a previous run, containing the saved
            configs, the model weights, and the formula files written by
            the text evaluation step.
    """
    dir_output = results

    # Restore configs, vocabulary and model weights from the run directory.
    config_data = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")
    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # Test dataset, preprocessed the same way as during training.
    test_set = DataGenerator(
        path_formulas=config_data.path_formulas_test,
        dir_images=config_data.dir_images_test,
        img_prepro=greyscale,
        max_iter=config_data.max_iter,
        bucket=config_data.bucket_test,
        path_matching=config_data.path_matching_test,
        max_len=config_data.max_length_formula,
        form_prepro=vocab.form_prepro,
    )

    # Render images from the reference and hypothesis formula files.
    formula_ref = path.join(dir_output, "formulas_test/ref.txt")
    formula_hyp = path.join(dir_output, "formulas_test/hyp_0.txt")
    images_ref = path.join(dir_output, "images_test/ref/")
    images_test = path.join(dir_output, "images_test/hyp_0/")
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # Score the rendered image directories and log one line of metrics.
    scores = score_dirs(images_ref, images_test, greyscale)
    msg = " - ".join("{} {:04.2f}".format(k, v) for k, v in scores.items())
    model.logger.info("- Eval Img: {}".format(msg))
def main(data, vocab):
    """Render images / matching files for all splits, then build the vocab.

    Args:
        data: path to the data config JSON.
        vocab: path to the vocab config JSON.
    """
    data_config = Config(data)

    def _dataset(formulas, images, matching):
        # Helper: a DataGenerator with only the fields build() needs.
        return DataGenerator(path_formulas=formulas,
                             dir_images=images,
                             path_matching=matching)

    train_set = _dataset(data_config.path_formulas_train,
                         data_config.dir_images_train,
                         data_config.path_matching_train)
    test_set = _dataset(data_config.path_formulas_test,
                        data_config.dir_images_test,
                        data_config.path_matching_test)
    val_set = _dataset(data_config.path_formulas_val,
                       data_config.dir_images_val,
                       data_config.path_matching_val)

    # Produce images and matching files; every split uses the same
    # bucketing and font settings (train first, then test, then val).
    for split in (train_set, test_set, val_set):
        split.build(buckets=data_config.buckets, fontIndex=0, fontLength=7)

    # Count tokens over the training formulas and write the vocab file.
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
def main(data, vocab):
    """Render unbucketed images / matching files for all splits, then
    build the token vocabulary from the training formulas.

    Args:
        data: path to the data config JSON.
        vocab: path to the vocab config JSON.
    """
    data_config = Config(data)

    train_set = DataGenerator(
        path_formulas=data_config.path_formulas_train,
        dir_images=data_config.dir_images_train,
        path_matching=data_config.path_matching_train,
    )
    test_set = DataGenerator(
        path_formulas=data_config.path_formulas_test,
        dir_images=data_config.dir_images_test,
        path_matching=data_config.path_matching_test,
    )
    val_set = DataGenerator(
        path_formulas=data_config.path_formulas_val,
        dir_images=data_config.dir_images_val,
        path_matching=data_config.path_matching_val,
    )

    # Produce images and matching files without bucketing.
    # (Removed commented-out dead code: alternate bucketed builds and a
    # Python-2-era Augmentor experiment that could never run here.)
    train_set.build(buckets=None)
    test_set.build(buckets=None)
    val_set.build(buckets=None)

    # Count tokens over the training formulas and write the vocab file.
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
# Script preamble: merge the "small" configs and build the train/val
# generators used by the rest of the file.
dir_output = "results/small/"

config = Config([
    "configs/data_small.json",
    "configs/vocab_small.json",
    "configs/training_small.json",
    "configs/model.json",
])
config.save(dir_output)

vocab = Vocab(config)

# Both generators share every preprocessing setting except the split.
_shared = dict(
    img_prepro=greyscale,
    max_iter=config.max_iter,
    max_len=config.max_length_formula,
    form_prepro=vocab.form_prepro,
)
train_set = DataGenerator(
    path_formulas=config.path_formulas_train,
    dir_images=config.dir_images_train,
    bucket=config.bucket_train,
    path_matching=config.path_matching_train,
    **_shared,
)
val_set = DataGenerator(
    path_formulas=config.path_formulas_val,
    dir_images=config.dir_images_val,
    bucket=config.bucket_val,
    path_matching=config.path_matching_val,
    **_shared,
)