def run():
    """Convert the training and test GNT directories into 'HWDB1.1.hdf5'.

    One HDF5 group is created per split ('trn' and 'tst').  Each group holds
    a 'bitmap' dataset of shape (size, 64, 64, 1) and a 'tagcode' dataset of
    shape (size, 1).  After a split has been written, the number of samples
    stored for it is printed.
    """
    with h5py.File('HWDB1.1.hdf5', 'w') as hdf:
        # (group name, dataset length, source directory) for each split.
        splits = [
            ('trn', TRN_SIZE, trn_dirpath),
            ('tst', TST_SIZE, tst_dirpath),
        ]
        for name, size, dirpath in splits:
            print('Converting \'%s\'...' % name)
            group = hdf.create_group(name)
            bitmaps = group.create_dataset(
                'bitmap', (size, 64, 64, 1), dtype='uint8')
            tagcodes = group.create_dataset(
                'tagcode', (size, 1), dtype='uint16')

            # The running count doubles as the row index of the next sample.
            sample_num = 0
            for bitmap, tagcode in utils.read_gnt_in_directory(dirpath):
                bitmaps[sample_num] = utils.normalize_bitmap(bitmap)
                tagcodes[sample_num] = tagcode
                sample_num += 1
            print("Sample Number: {0}".format(sample_num))
#!/usr/bin/env python
# This script can be used to see how bitmaps will be preprocessed before
# subsetting the CASIA HWDB1.1 data set.
#
# Usage: <script> gnt_dirpath
#
# For every sample read from gnt_dirpath the character is printed and a
# figure with the bitmap as read (left) and the preprocessed bitmap (right)
# is shown.

import sys

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

import utils

if len(sys.argv) != 2:
    # A single parenthesised argument prints the same text under the
    # Python 2 print statement and the Python 3 print function.
    print('Usage: %s gnt_dirpath' % sys.argv[0])
    # Report the usage error with a non-zero status; the bare exit() helper
    # is injected by the site module and exits with status 0.
    sys.exit(1)

gnt_dirpath = sys.argv[1]

for bitmap, tagcode in utils.read_gnt_in_directory(gnt_dirpath):
    # wrong terminal encoding = garbage
    print(utils.tagcode_to_unicode(tagcode).encode('utf-8'))

    proc_bitmap = utils.normalize_bitmap(bitmap)
    proc_bitmap = utils.preprocess_bitmap(proc_bitmap)

    plt.subplot(121)
    plt.imshow(bitmap, cmap=cm.Greys_r)
    plt.subplot(122)
    # Drop the leading length-1 axis of the preprocessed bitmap for display.
    plt.imshow(np.squeeze(proc_bitmap, axis=0), cmap=cm.Greys_r)
    # Show the figure for this sample before moving on to the next one.
    plt.show()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# This script counts the characters of the CASIA HWDB1.1 data set.
#
# Usage: <script> trn_dirpath tst_dirpath
#
# For every character it tallies how many samples occur in the training and
# in the test directory, then writes one "<char>: <trn>, <tst>" line per
# character to frequencies.txt, ordered by descending training count.

import sys
from collections import Counter, defaultdict

import utils

if len(sys.argv) != 3:
    print('Usage: %s trn_dirpath tst_dirpath' % sys.argv[0])
    # Report the usage error with a non-zero status; the bare exit() helper
    # is injected by the site module and exits with status 0.
    sys.exit(1)

trn_dirpath = sys.argv[1]
tst_dirpath = sys.argv[2]

# Maps each character to a Counter holding its 'trn' and 'tst' sample counts.
frequencies = defaultdict(Counter)
for split, dirpath in (('trn', trn_dirpath), ('tst', tst_dirpath)):
    for _bitmap, tagcode in utils.read_gnt_in_directory(dirpath):
        frequencies[utils.tagcode_to_unicode(tagcode)][split] += 1

# Write the characters themselves as UTF-8 text.  Formatting the result of
# str.encode() with %s would put the bytes repr (b'\xe4...') into the file,
# and omitting the encoding would make the output depend on the locale.
with open('frequencies.txt', 'w', encoding='utf-8') as f:
    by_trn_count = sorted(
        frequencies.items(),
        key=lambda item: item[1]['trn'],
        reverse=True,
    )
    for char, counts in by_trn_count:
        f.write('%s: %d, %d\n' % (char, counts['trn'], counts['tst']))
#!/usr/bin/env python2
# This script can be used to see how bitmaps will be preprocessed before
# subsetting the CASIA HWDB1.1 data set.
#
# Usage: <script> gnt_dirpath
#
# For every sample read from gnt_dirpath the character is printed and a
# figure with the bitmap as read (left) and the preprocessed bitmap (right)
# is shown.

import sys

import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np

import utils

if len(sys.argv) != 2:
    # A single parenthesised argument prints the same text as the bare
    # Python 2 print statement did.
    print('Usage: %s gnt_dirpath' % sys.argv[0])
    # Report the usage error with a non-zero status; the bare exit() helper
    # is injected by the site module and exits with status 0.
    sys.exit(1)

gnt_dirpath = sys.argv[1]

for bitmap, tagcode in utils.read_gnt_in_directory(gnt_dirpath):
    # wrong terminal encoding = garbage
    print(utils.tagcode_to_unicode(tagcode).encode('utf-8'))

    proc_bitmap = utils.normalize_bitmap(bitmap)
    proc_bitmap = utils.preprocess_bitmap(proc_bitmap)

    plt.subplot(121)
    plt.imshow(bitmap, cmap=cm.Greys_r)
    plt.subplot(122)
    # Drop the leading length-1 axis of the preprocessed bitmap for display.
    plt.imshow(np.squeeze(proc_bitmap, axis=0), cmap=cm.Greys_r)
    # Show the figure for this sample before moving on to the next one.
    plt.show()
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Converts the samples found in a training and a test GNT directory into a
# single HDF5 file, 'HWDB1.1.hdf5', with one group per split.
#
# Usage: <script> trn_dirpath tst_dirpath

import sys

import h5py

import utils

if len(sys.argv) != 3:
    print('Usage: %s trn_dirpath tst_dirpath' % sys.argv[0])
    sys.exit(1)

_, trn_dirpath, tst_dirpath = sys.argv

# (group name, dataset length, source directory) for each split.
splits = (
    ('trn', 897758, trn_dirpath),
    ('tst', 223991, tst_dirpath),
)

with h5py.File('HWDB1.1.hdf5', 'w') as hdf5_file:
    for name, size, dirpath in splits:
        print('Converting \'%s\'...' % name)
        group = hdf5_file.create_group(name)
        bitmaps = group.create_dataset(
            'bitmap', (size, 1, 64, 64), dtype='uint8')
        tagcodes = group.create_dataset(
            'tagcode', (size, 1), dtype='uint16')

        # Row `row` of each dataset holds the row-th sample read from dirpath.
        samples = utils.read_gnt_in_directory(dirpath)
        for row, (bitmap, tagcode) in enumerate(samples):
            bitmaps[row] = utils.normalize_bitmap(bitmap)
            tagcodes[row] = tagcode
# Converts the samples found in a training and a test GNT directory into a
# single HDF5 file, 'HWDB1.1.hdf5'.  Each split gets its own group holding
# the 'bitmap', 'tagcode' and 'writercode' datasets.
#
# Usage: <script> trn_dirpath tst_dirpath

import sys

import h5py

import utils

if len(sys.argv) != 3:
    print('Usage: %s trn_dirpath tst_dirpath' % sys.argv[0])
    sys.exit(1)

trn_dirpath, tst_dirpath = sys.argv[1], sys.argv[2]

with h5py.File('HWDB1.1.hdf5', 'w') as out:
    # (group name, dataset length, source directory) for each split.
    for name, size, dirpath in (('trn', 897758, trn_dirpath),
                                ('tst', 223991, tst_dirpath)):
        print('Converting \'%s\'...' % name)
        group = out.create_group(name)

        # Both code datasets store one uint16 value per sample.
        code_shape = (size, 1)
        bitmaps = group.create_dataset(
            'bitmap', (size, 1, 64, 64), dtype='uint8')
        tagcodes = group.create_dataset(
            'tagcode', code_shape, dtype='uint16')
        writercodes = group.create_dataset(
            'writercode', code_shape, dtype='uint16')

        records = utils.read_gnt_in_directory(dirpath)
        for row, (bitmap, tagcode, writercode) in enumerate(records):
            bitmaps[row] = utils.normalize_bitmap(bitmap)
            tagcodes[row] = tagcode
            writercodes[row] = writercode
#!/usr/bin/env python2
# This script counts the characters of the CASIA HWDB1.1 data set.
#
# Usage: <script> trn_dirpath tst_dirpath
#
# For every character it tallies how many samples occur in the training and
# in the test directory, then writes one "<char>: <trn>, <tst>" line per
# character to frequencies.txt, ordered by descending training count.

import sys
from collections import Counter, defaultdict

import utils

if len(sys.argv) != 3:
    # A single parenthesised argument prints the same text as the bare
    # Python 2 print statement did.
    print('Usage: %s trn_dirpath tst_dirpath' % sys.argv[0])
    # Report the usage error with a non-zero status; the bare exit() helper
    # is injected by the site module and exits with status 0.
    sys.exit(1)

trn_dirpath = sys.argv[1]
tst_dirpath = sys.argv[2]

# Maps each character to a Counter holding its 'trn' and 'tst' sample counts.
frequencies = defaultdict(Counter)
for split, dirpath in (('trn', trn_dirpath), ('tst', tst_dirpath)):
    for _bitmap, tagcode in utils.read_gnt_in_directory(dirpath):
        frequencies[utils.tagcode_to_unicode(tagcode)][split] += 1

with open('frequencies.txt', 'w') as f:
    # Index into the (character, counts) pair instead of unpacking it in the
    # lambda signature: tuple parameters were removed from the language by
    # PEP 3113, and the indexed form sorts identically.
    by_trn_count = sorted(
        frequencies.iteritems(),
        key=lambda item: item[1]['trn'],
        reverse=True,
    )
    for char, counts in by_trn_count:
        # Under Python 2 the encoded character is a byte string, which %s
        # inserts verbatim into the output line.
        f.write('%s: %d, %d\n' % (char.encode('utf-8'),
                                  counts['trn'], counts['tst']))