Example #1
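All of the examples below come from the same im2latex-style pipeline and omit their imports. A hedged reconstruction of the shared imports is given here; the module paths are assumptions based on the identifiers used, not confirmed by the listing.

# Assumed shared imports for the examples below; the module paths are
# guesses and should be adjusted to the actual project layout.
from model.utils.general import Config
from model.utils.text import Vocab, pad_batch_formulas, load_formulas
from model.utils.image import greyscale, build_images
from model.utils.data_generator import DataGenerator, minibatches
from model.utils.lr_schedule import LRSchedule
from model.img2seq import Img2SeqModel
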
def main(data, vocab, training, model, output):
    # Load configs
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Load datasets
    train_set = DataGenerator(path_formulas=config.path_formulas_train,
            dir_images=config.dir_images_train, img_prepro=greyscale,
            max_iter=config.max_iter, bucket=config.bucket_train,
            path_matching=config.path_matching_train,
            max_len=config.max_length_formula,
            form_prepro=vocab.form_prepro)
    val_set = DataGenerator(path_formulas=config.path_formulas_val,
            dir_images=config.dir_images_val, img_prepro=greyscale,
            max_iter=config.max_iter, bucket=config.bucket_val,
            path_matching=config.path_matching_val,
            max_len=config.max_length_formula,
            form_prepro=vocab.form_prepro)

    # Define learning rate schedule
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                        config.batch_size)
    lr_schedule = LRSchedule(lr_init=config.lr_init,
            start_decay=config.start_decay*n_batches_epoch,
            end_decay=config.end_decay*n_batches_epoch,
            end_warm=config.end_warm*n_batches_epoch,
            lr_warm=config.lr_warm,
            lr_min=config.lr_min)

    # Build model and train
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    model.train(config, train_set, val_set, lr_schedule)
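
The main signature suggests a command-line entry point. A minimal click wrapper might look like the following sketch; the option names mirror the parameters, but the default paths are illustrative assumptions.

import click

@click.command()
@click.option("--data", default="configs/data.json")
@click.option("--vocab", default="configs/vocab.json")
@click.option("--training", default="configs/training.json")
@click.option("--model", default="configs/model.json")
@click.option("--output", default="results/full/")
def cli(data, vocab, training, model, output):
    # Forward the parsed options to the main() defined above
    main(data, vocab, training, model, output)

if __name__ == "__main__":
    cli()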
Example #2
def main(data, vocab, training, model, output):
    # Load configs
    dir_output = output
    config = Config([data, vocab, training, model])
    config.save(dir_output)
    vocab = Vocab(config)

    # Load datasets
    train_set = DataGenerator(path_formulas=config.path_formulas_train,
            dir_images=config.dir_images_train, img_prepro=greyscale,
            max_iter=config.max_iter, bucket=config.bucket_train,
            path_matching=config.path_matching_train,
            max_len=config.max_length_formula,
            form_prepro=vocab.form_prepro)

    # Export the preprocessed training batches to NumPy files instead of training
    all_img = []
    all_formula = []
    for _img, _formula in minibatches(train_set, config.batch_size):
        all_img.append(_img)
        if _formula is not None:
            _formula, _formula_length = pad_batch_formulas(
                _formula, vocab.id_pad, vocab.id_end)
        all_formula.append(_formula)
    
    # np.save appends ".npy"; dtype=object allows batches of differing shapes
    np.save('np_formula', np.array(all_formula, dtype=object))
    np.save('np_img', np.array(all_img, dtype=object))

    print("DONE EXPORTING NUMPY FILES")
    return None  # early return: the training flow below is never executed
    val_set = DataGenerator(path_formulas=config.path_formulas_val,
            dir_images=config.dir_images_val, img_prepro=greyscale,
            max_iter=config.max_iter, bucket=config.bucket_val,
            path_matching=config.path_matching_val,
            max_len=config.max_length_formula,
            form_prepro=vocab.form_prepro)

    # Define learning rate schedule
    n_batches_epoch = ((len(train_set) + config.batch_size - 1) //
                        config.batch_size)
    lr_schedule = LRSchedule(lr_init=config.lr_init,
            start_decay=config.start_decay*n_batches_epoch,
            end_decay=config.end_decay*n_batches_epoch,
            end_warm=config.end_warm*n_batches_epoch,
            lr_warm=config.lr_warm,
            lr_min=config.lr_min)

    # Build model and train
    model = Img2SeqModel(config, dir_output, vocab)
    model.build_train(config)
    model.train(config, train_set, val_set, lr_schedule)
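
For reference, reading the exported arrays back might look like this sketch. np.save appends the ".npy" extension, and object arrays (batches of differing shapes) require allow_pickle=True on load.

import numpy as np

all_img = np.load("np_img.npy", allow_pickle=True)
all_formula = np.load("np_formula.npy", allow_pickle=True)
print(len(all_img), "image batches,", len(all_formula), "formula batches")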
Example #3
def main(results):
    # restore config and model
    dir_output = results

    config_data  = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqCtcModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
            dir_images=config_data.dir_images_test, img_prepro=greyscale,
            max_iter=config_data.max_iter, bucket=config_data.bucket_test,
            path_matching=config_data.path_matching_test,
            max_len=config_data.max_length_formula,
            form_prepro=vocab.form_prepro,)

    # use model to write predictions in files
    config_eval = Config({"dir_answers": dir_output + "formulas_test/",
                          "batch_size": 20})
    files, perplexity = model.write_prediction(config_eval, test_set)
    formula_ref, formula_hyp = files[0], files[1]

    # score the ref and prediction files
    scores = score_files(formula_ref, formula_hyp)
    scores["perplexity"] = perplexity
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Test Txt: {}".format(msg))
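
score_files is project code; judging from the call site, its contract is a reference file and a hypothesis file in, a dict of named scores out. A minimal exact-match stand-in, purely to illustrate that contract:

def exact_match_score(path_ref, path_hyp):
    # Hypothetical stand-in, not the project's score_files implementation:
    # compares the reference and hypothesis files line by line.
    with open(path_ref) as f_ref, open(path_hyp) as f_hyp:
        refs = [line.strip() for line in f_ref]
        hyps = [line.strip() for line in f_hyp]
    matches = sum(r == h for r, h in zip(refs, hyps))
    return {"ExactMatch": 100.0 * matches / max(len(refs), 1)}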
Example #4
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(path_formulas=data_config.path_formulas_train,
                              dir_images=data_config.dir_images_train,
                              path_matching=data_config.path_matching_train)
    # test_set  = DataGenerator(
    #     path_formulas=data_config.path_formulas_test,
    #     dir_images=data_config.dir_images_test,
    #     path_matching=data_config.path_matching_test)
    # val_set   = DataGenerator(
    #     path_formulas=data_config.path_formulas_val,
    #     dir_images=data_config.dir_images_val,
    #     path_matching=data_config.path_matching_val)
    #
    # # produce images and matching files
    # train_set.build(buckets=data_config.buckets)
    # test_set.build(buckets=data_config.buckets)
    # val_set.build(buckets=data_config.buckets)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
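
build_vocab and write_vocab are project helpers. A hedged sketch of the behavior the call site implies (count tokens across the generators, keep the frequent ones, write one token per line):

from collections import Counter

def build_vocab_sketch(datasets, min_count=1):
    # Assumes each dataset yields (image, formula) pairs, with formula a
    # sequence of tokens, as in the other examples.
    counts = Counter()
    for dataset in datasets:
        for _img, formula in dataset:
            counts.update(formula)
    return {tok for tok, n in counts.items() if n >= min_count}

def write_vocab_sketch(vocab, path):
    with open(path, "w") as f:
        f.write("\n".join(sorted(vocab)))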
Example #5
def main(results):
    # restore config and model
    dir_output = results

    config_data  = Config(dir_output + "data.json")
    config_vocab = Config(dir_output + "vocab.json")
    config_model = Config(dir_output + "model.json")

    vocab = Vocab(config_vocab)
    model = Img2SeqModel(config_model, dir_output, vocab)
    model.build_pred()
    model.restore_session(dir_output + "model.weights/")

    # load dataset
    test_set = DataGenerator(path_formulas=config_data.path_formulas_test,
            dir_images=config_data.dir_images_test, img_prepro=greyscale,
            max_iter=config_data.max_iter, bucket=config_data.bucket_test,
            path_matching=config_data.path_matching_test,
            max_len=config_data.max_length_formula,
            form_prepro=vocab.form_prepro,)


    # build images from formulas
    formula_ref = path.join(dir_output, "formulas_test/ref.txt")
    formula_hyp = path.join(dir_output, "formulas_test/hyp_0.txt")
    images_ref  = path.join(dir_output, "images_test/ref/")
    images_test = path.join(dir_output, "images_test/hyp_0/")
    build_images(load_formulas(formula_ref), images_ref)
    build_images(load_formulas(formula_hyp), images_test)

    # score the repositories
    scores = score_dirs(images_ref, images_test, greyscale)
    msg = " - ".join(["{} {:04.2f}".format(k, v) for k, v in scores.items()])
    model.logger.info("- Eval Img: {}".format(msg))
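
load_formulas is also project code; a plausible minimal stand-in, assuming one formula per line keyed by line index:

def load_formulas_sketch(path):
    # Hypothetical reader: maps line index -> formula string.
    with open(path) as f:
        return {idx: line.strip() for idx, line in enumerate(f)}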
Example #6
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(path_formulas=data_config.path_formulas_train,
                              dir_images=data_config.dir_images_train,
                              path_matching=data_config.path_matching_train)
    test_set = DataGenerator(path_formulas=data_config.path_formulas_test,
                             dir_images=data_config.dir_images_test,
                             path_matching=data_config.path_matching_test)
    val_set = DataGenerator(path_formulas=data_config.path_formulas_val,
                            dir_images=data_config.dir_images_val,
                            path_matching=data_config.path_matching_val)

    # produce images and matching files
    train_set.build(buckets=data_config.buckets, fontIndex=0, fontLength=7)
    test_set.build(buckets=data_config.buckets, fontIndex=0, fontLength=7)
    val_set.build(buckets=data_config.buckets, fontIndex=0, fontLength=7)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
Example #7
def main(data, vocab):
    data_config = Config(data)

    # datasets
    train_set = DataGenerator(path_formulas=data_config.path_formulas_train,
                              dir_images=data_config.dir_images_train,
                              path_matching=data_config.path_matching_train)
    test_set = DataGenerator(path_formulas=data_config.path_formulas_test,
                             dir_images=data_config.dir_images_test,
                             path_matching=data_config.path_matching_test)
    val_set = DataGenerator(path_formulas=data_config.path_formulas_val,
                            dir_images=data_config.dir_images_val,
                            path_matching=data_config.path_matching_val)

    # produce images and matching files
    # train_set.build(buckets=None, n_threads=1)
    train_set.build(buckets=None)
    test_set.build(buckets=None)
    val_set.build(buckets=None)
    # train_set.build(buckets=data_config.buckets)
    # test_set.build(buckets=data_config.buckets)
    # val_set.build(buckets=data_config.buckets)

    # p = Augmentor.Pipeline(data_config.dir_images_train)
    # p.zoom(probability=1, min_factor=1.1, max_factor=1.5)
    # # p.process()
    # augmented_images, labels = p.sample(3)
    # print(labels)

    # vocab
    vocab_config = Config(vocab)
    vocab = build_vocab([train_set], min_count=vocab_config.min_count_tok)
    write_vocab(vocab, vocab_config.path_vocab)
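
The commented-out block above is Python 2 (print labels). A Python 3 sketch of the same Augmentor experiment; per the library's documentation, sample(n) writes the augmented images into an output/ subfolder of the source directory.

import Augmentor

p = Augmentor.Pipeline(data_config.dir_images_train)
p.zoom(probability=1, min_factor=1.1, max_factor=1.5)
p.sample(3)  # writes 3 augmented images to <dir_images_train>/output/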
Example #8
dir_output = "results/small/"
config = Config([
    "configs/data_small.json",
    "configs/vocab_small.json",
    "configs/training_small.json",
    "configs/model.json",
])
config.save(dir_output)
vocab = Vocab(config)

train_set = DataGenerator(
    path_formulas=config.path_formulas_train,
    dir_images=config.dir_images_train,
    img_prepro=greyscale,
    max_iter=config.max_iter,
    bucket=config.bucket_train,
    path_matching=config.path_matching_train,
    max_len=config.max_length_formula,
    form_prepro=vocab.form_prepro
)
val_set = DataGenerator(
    path_formulas=config.path_formulas_val,
    dir_images=config.dir_images_val,
    img_prepro=greyscale,
    max_iter=config.max_iter,
    bucket=config.bucket_val,
    path_matching=config.path_matching_val,
    max_len=config.max_length_formula,
    form_prepro=vocab.form_prepro
)
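
Continuing the snippet with a quick sanity check of one batch, reusing the minibatches and pad_batch_formulas helpers seen in Example #2 (a sketch under the assumption that they are importable here):

for img, formula in minibatches(train_set, config.batch_size):
    # Pad the token sequences so the batch becomes a rectangular id matrix
    formula, formula_length = pad_batch_formulas(
        formula, vocab.id_pad, vocab.id_end)
    print("images:", len(img), "- formula batch shape:", formula.shape)
    break  # inspect only the first batch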