Example #1
0
    def ocr_box_file(self, box_fname):
        """OCR one .box file and write three text outputs next to it.

        box_fname -- path ending in '.box'; outputs replace that suffix:
          .ml.txt     most-likely (as-is) recognition,
          .nogram.txt best a-priori strings (no ngram rescoring),
          .gram.txt   ngram-rescored strings.
        A per-file log (<name>.<loglevelname>.log) is also configured.
        """
        # Build output names from the input name (a def rather than a
        # name-bound lambda, per PEP 8 E731).
        def replace(suffix):
            return box_fname.replace('.box', suffix)

        asis_fname = replace('.ml.txt')
        nogram_out_fname = replace('.nogram.txt')
        ngram_out_fname = replace('.gram.txt')

        log_fname = replace('.{}.log'.format(self.loglevelname))
        # NOTE: basicConfig is a no-op once the root logger has handlers,
        # so only the first file processed in a run actually gets a log.
        logging.basicConfig(filename=log_fname,
                            level=self.loglevel,
                            filemode="w")

        # Read Bantries & write the most-likely output verbatim.
        bf = BantryFile(box_fname)
        with open(asis_fname, 'w', encoding='utf-8') as f:
            f.write(post_process(bf.text))

        # Rescore each line both with and without the ngram model.
        ngrammed_lines, notgrammed_lines = [], []
        for linenum in range(bf.num_lines):
            print("Line ", linenum)
            line_bantries = bf.get_line_bantires(linenum)
            gramgraph = GramGraph(line_bantries)
            gramgraph.process_tree()
            notgrammed_lines.append(gramgraph.get_best_apriori_str())
            ngrammed_lines.append(gramgraph.get_best_str())

        nogram_out = post_process("\n".join(notgrammed_lines))
        with open(nogram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(nogram_out)

        ngram_out = post_process("\n".join(ngrammed_lines))
        with open(ngram_out_fname, 'w', encoding='utf-8') as out_file:
            out_file.write(ngram_out)

        print("Input : ", box_fname)
        print("As is output : ", asis_fname)
        print("Without ngram : ", nogram_out_fname)
        print("With ngram : ", ngram_out_fname)
        print("Log : ", log_fname)
Example #2
0
    def ocr_box_file(self, box_fname):
        """Run OCR over a .box file, emitting as-is, no-ngram and ngram text.

        The three output paths are derived from *box_fname* by swapping the
        '.box' suffix; a log file is configured alongside them.
        """
        def derived(suffix):
            # Output paths share the input's base name.
            return box_fname.replace('.box', suffix)

        asis_fname = derived('.ml.txt')
        nogram_out_fname = derived('.nogram.txt')
        ngram_out_fname = derived('.gram.txt')
        log_fname = derived('.{}.log'.format(self.loglevelname))

        logging.basicConfig(filename=log_fname,
                            level=self.loglevel,
                            filemode="w")

        # Most-likely text, written untouched by the ngram machinery.
        box_file = BantryFile(box_fname)
        with open(asis_fname, 'w', encoding='utf-8') as fout:
            fout.write(post_process(box_file.text))

        # Per-line rescoring: keep both the a-priori and the ngram pick.
        with_gram, without_gram = [], []
        for line_no in range(box_file.num_lines):
            print("Line ", line_no)
            graph = GramGraph(box_file.get_line_bantires(line_no))
            graph.process_tree()
            without_gram.append(graph.get_best_apriori_str())
            with_gram.append(graph.get_best_str())

        with open(nogram_out_fname, 'w', encoding='utf-8') as fout:
            fout.write(post_process("\n".join(without_gram)))

        with open(ngram_out_fname, 'w', encoding='utf-8') as fout:
            fout.write(post_process("\n".join(with_gram)))

        print("Input : ", box_fname)
        print("As is output : ", asis_fname)
        print("Without ngram : ", nogram_out_fname)
        print("With ngram : ", ngram_out_fname)
        print("Log : ", log_fname)
if __name__ == "__main__":
    import sys
    from scaler import ScalerFactory
    from bantry import Bantry, BantryFile
    from classifier import Classifier
    from ngram import Ngram

    # CLI: [nnet.pkl] [file.box] [scaler.scl] [labels.lbl]; each argument
    # falls back to a bundled sample when omitted.
    nnet_file = sys.argv[1] if len(sys.argv) > 1 else "library/nn.pkl"
    banti_file_name = sys.argv[2] if len(sys.argv) > 2 else "sample_images/praasa.box"
    scaler_prms_file = sys.argv[3] if len(sys.argv) > 3 else "scalings/relative48.scl"
    labellings_file = sys.argv[4] if len(sys.argv) > 4 else "labellings/alphacodes.lbl"
    ngram_file = "library/mega.123.pkl"

    # Wire the shared scaler/classifier into Bantry, then load the input.
    Bantry.scaler = ScalerFactory(scaler_prms_file)
    Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=1)
    bf = BantryFile(banti_file_name)

    ngram = Ngram(ngram_file)
    GramGraph.set_ngram(ngram)

    # Debug dump: per line, the top path through every graph node, then the
    # best string with and without ngram rescoring ('|' joins glyphs).
    for linenum in range(bf.num_lines):
        print('*' * 80)
        bantires = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(bantires)
        gramgraph.process_tree()
        gramgraph.find_top_ngram_paths()
        # Only the node index is needed; the child list itself was an
        # unused loop variable in the original enumerate().
        for node in range(len(gramgraph.lchildren)):
            print(gramgraph.top_pathnodes_at(node, 1))
        print(gramgraph.get_best_str('|'))
        print(gramgraph.get_best_apriori_str('|'))
Example #4
0
############################## Derive output names & configure logging
# NOTE(review): `replace` is defined above this chunk — presumably
# s -> banti_fname.replace('.box', s); confirm upstream.
# NOTE(review): logging._levelToName is a private logging attribute; the
# public equivalent is logging.getLevelName(loglevel).
log_fname = replace('.{}.log'.format(logging._levelToName[loglevel]).lower())
asis_fname = replace('.ml.txt')  # most-likely (as-is) text output
nogram_out_fname = replace('.nogram.txt')  # best a-priori (no ngram) output
ngram_out_fname = replace('.gram.txt')  # ngram-rescored output

logging.basicConfig(filename=log_fname, level=loglevel, filemode="w")

############################## Set-up scaler, classifier, ngram etc.
Bantry.scaler = ScalerFactory(scaler_fname)
Bantry.classifier = Classifier(nnet_fname, labels_fname, logbase=1)
ng = Ngram(ngram_fname)
Bantry.ngram = ng  # the same ngram model is shared by bantries and the graph
GramGraph.set_ngram(ng)

############################## Read Bantries & get Most likely output
bf = BantryFile(banti_fname)
with open(asis_fname, 'w', encoding='utf-8') as f:
    f.write(post_process(bf.text))

############################## Process using ngrams
# Collect, per line, the best string with and without ngram rescoring.
ngrammed_lines, notgrammed_lines = [], []

for linenum in range(bf.num_lines):
    print("Line ", linenum)
    line_bantries = bf.get_line_bantires(linenum)  # (sic: "bantires")
    gramgraph = GramGraph(line_bantries)
    gramgraph.process_tree()
    notgrammed_lines.append(gramgraph.get_best_apriori_str())
    ngrammed_lines.append(gramgraph.get_best_str())

# NOTE(review): nogram_out is computed but never written in this chunk —
# the following lines switch to loading scaler/network files; this looks
# like two spliced fragments. Verify against the full file.
nogram_out = post_process("\n".join(notgrammed_lines))
# Load scaler parameters written as a Python literal (safe via literal_eval).
with open(scaler_prms_file, 'r') as sfp:
    scaler_prms = ast.literal_eval(sfp.read())

# SECURITY: pickle.load can execute arbitrary code — only load trusted
# network files.
with open(nnet_prms_file_name, 'rb') as nnet_prms_file:
    nnet_prms = pickle.load(nnet_prms_file)

with open(labelings_file_name, encoding='utf-8') as labels_fp:
    labellings = ast.literal_eval(labels_fp.read())

# print(labellings)
# Map label indices to unicode characters — presumably one code point per
# label; confirm against LabelToUnicodeConverter.
chars = LabelToUnicodeConverter(labellings).onecode

############################################# Init Network
Bantry.scaler = ScalerFactory(scaler_prms)
bf = BantryFile(banti_file_name)

# Evaluate one glyph at a time.
nnet_prms['training_params']['BATCH_SZ'] = 1
ntwk = NeuralNet(**nnet_prms)
tester = ntwk.get_data_test_model(go_nuts=True)

############################################# Image saver
# Strips the last 7 characters of the file name — presumably a fixed
# extension like ".pkl.gz"; TODO confirm.
dir_name = os.path.basename(nnet_prms_file_name)[:-7] + '/'
if not os.path.exists(dir_name):
    os.makedirs(dir_name)
# namer(index, char, rank) -> "<dir>/NNN_c_RR.png"
namer = (dir_name + '{:03d}_{}_{:02d}.png').format
print("Look for me in :", dir_name)
def saver(outs, ch, debug=True):
    saver.index += 1
    for i, out in enumerate(outs):
Example #6
0
    # Script body — the enclosing `if __name__ == "__main__":` guard and
    # `import sys` sit above this chunk (not shown).
    from bantry import Bantry, BantryFile
    from classifier import Classifier
    from ngram import Ngram

    # CLI: [nnet.pkl] [file.box] [scaler.scl] [labels.lbl]; each argument
    # falls back to a bundled sample when omitted.
    nnet_file = sys.argv[1] if len(sys.argv) > 1 else "library/nn.pkl"
    banti_file_name = sys.argv[2] if len(
        sys.argv) > 2 else "sample_images/praasa.box"
    scaler_prms_file = sys.argv[3] if len(
        sys.argv) > 3 else "scalings/relative48.scl"
    labellings_file = sys.argv[4] if len(
        sys.argv) > 4 else "labellings/alphacodes.lbl"
    ngram_file = "library/mega.123.pkl"

    # Wire the shared scaler/classifier into Bantry, then load the input.
    Bantry.scaler = ScalerFactory(scaler_prms_file)
    Bantry.classifier = Classifier(nnet_file, labellings_file, logbase=1)
    bf = BantryFile(banti_file_name)

    ngram = Ngram(ngram_file)
    GramGraph.set_ngram(ngram)

    # Debug dump: per line, the top path at every node, then the best
    # string with and without ngram rescoring ('|' joins glyphs).
    for linenum in range(bf.num_lines):
        print('*' * 80)
        bantires = bf.get_line_bantires(linenum)
        gramgraph = GramGraph(bantires)
        gramgraph.process_tree()
        gramgraph.find_top_ngram_paths()
        # `children` is unused here; only the node index matters.
        for node, children in enumerate(gramgraph.lchildren):
            print(gramgraph.top_pathnodes_at(node, 1))
        print(gramgraph.get_best_str('|'))
        print(gramgraph.get_best_apriori_str('|'))