Example #1
0
def words_norm(location, output):
    """Normalize the PNG word images of a dataset into a new folder.

    Every ``*.png`` found in ``<location>/<data_folder>`` is read with
    OpenCV, passed through ``word_normalization`` (height 64, no border,
    no tilt correction, no hysteresis normalization) and written under the
    same file name into ``<location>/<output>``.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.

    Returns:
        1 if the output folder already exists (the dataset is skipped),
        otherwise None.
    """
    output = os.path.join(location, output)
    if os.path.exists(output):
        print("THIS DATASET IS BEING SKIPPED")
        print("Output folder already exists:", output)
        return 1
    os.makedirs(output)

    # ``data_folder`` is a module-level name defined outside this function.
    imgs = glob.glob(os.path.join(location, data_folder, '*.png'))
    length = len(imgs)

    for i, img_path in enumerate(imgs):
        image = cv2.imread(img_path)
        # Simple check for invalid images. cv2.imread returns None when the
        # file cannot be decoded, so test for that before touching .shape.
        if image is not None and image.shape[0] > 20:
            cv2.imwrite(
                os.path.join(output, os.path.basename(img_path)),
                word_normalization(image,
                                   height=64,
                                   border=False,
                                   tilt=False,
                                   hyst_norm=False))
        # NOTE(review): looks like a leftover debug print that interleaves
        # with the progress bar output -- confirm before removing.
        print(i)
        print_progress_bar(i, length)

    print("\tNumber of normalized words:", len(os.listdir(output)))
Example #2
0
def extract(location, output, number=4):
    """Copy the labelled ORAND-CAR images into a single output folder.

    For both ``ORAND-CAR-2014/CAR-A`` and ``ORAND-CAR-2014/CAR-B`` every
    ``*.txt`` label file is read line by line. Each line holds an image name
    and its label separated by a tab. Non-empty image files are copied to
    ``<location>/<output>/<label>_<number>_<timestamp>.png``.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for sub in ['ORAND-CAR-2014/CAR-A', 'ORAND-CAR-2014/CAR-B']:
        folder = os.path.join(location, sub)
        l_files = glob.glob(os.path.join(folder, '*.txt'))

        # Count all label lines up front for the progress bar; the files
        # are opened with ``with`` so no handle is left dangling.
        length = 0
        for fl in l_files:
            with open(fl) as f:
                length += sum(1 for _ in f)

        itr = 0
        for fl in l_files:
            # The image folder path is the label file path with its last
            # six characters replaced by 'images'.
            im_folder = fl[:-6] + 'images'
            with open(fl) as f:
                for line in f:
                    im, word = line.strip().split('\t')
                    impath = os.path.join(im_folder, im)

                    # Skip zero-byte image files.
                    if os.stat(impath).st_size != 0:
                        outpath = os.path.join(
                            output,
                            '%s_%s_%s.png' % (word, number, time.time()))
                        copyfile(impath, outpath)
                    # The running counter is ``itr``; the original passed an
                    # undefined name ``i`` here.
                    print_progress_bar(itr, length)
                    itr += 1

    print("\tNumber of words:", len(os.listdir(output)))
def create_csv(datadir):
    """Write ``train.csv``, ``dev.csv`` and ``test.csv`` into ``datadir``.

    For each split the PNG files in ``<datadir>/<split>`` are loaded with
    ``cv2.imread(path, 0)``. One CSV row is written per image with the
    columns:

    * ``label``    -- file name up to the first underscore,
    * ``shape``    -- the image shape without the surrounding parentheses,
    * ``image``    -- the flattened pixel values, comma separated,
    * ``gaplines`` -- content of the sibling ``.txt`` file (loaded with
      ``simplejson`` and stripped of its first and last character), or the
      string ``'None'`` when no such file exists.

    Args:
        datadir: Folder containing the ``train``, ``dev`` and ``test``
            sub-folders; the CSV files are written directly into it.
    """
    print('Converting word images to CSV...')
    fieldnames = ['label', 'shape', 'image', 'gaplines']

    for split in ['train', 'dev', 'test']:
        img_paths = glob.glob(os.path.join(datadir, split, '*.png'))
        length = len(img_paths)

        # (label, image, gaplines) for every image of the split.
        rows = []
        for i, img in enumerate(img_paths):
            gaplines = 'None'
            gap_file = img[:-3] + 'txt'
            if os.path.isfile(gap_file):
                with open(gap_file, 'r') as fp:
                    gaplines = str(simplejson.load(fp))[1:-1]
            label = os.path.basename(img).split('_')[0]
            rows.append((label, cv2.imread(img, 0), gaplines))
            print_progress_bar(i, length)

        # newline='' lets the csv module control line endings itself.
        csv_path = os.path.join(datadir, split + '.csv')
        with open(csv_path, 'w', newline='') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            for label, image, gaplines in rows:
                writer.writerow({
                    'label': label,
                    # The shape belongs to the image itself; the original
                    # asked the (image, gaplines) tuple for ``.shape``.
                    'shape': str(image.shape)[1:-1],
                    'image': str(list(image.flatten()))[1:-1],
                    'gaplines': gaplines,
                })

    print('\tCSV files created!')
Example #4
0
def extract(location, output, number=5):
    """Cut word images out of the page scans of the ``lob``/``numbers`` sets.

    Every ``*.seg`` file in ``<location>/lob`` and ``<location>/numbers``
    starts with a line holding an integer (used here as the number of
    entries for the progress bar). Each following line is space separated:
    the first field is an identifier whose part before the first underscore
    is used as the word, the remaining fields are integers read as
    ``x1 x2 y1 y2`` to crop the ``.tiff`` image sharing the segment file's
    base name. Non-empty crops are written to
    ``<location>/<output>/<word>_<number>_<timestamp>.png``.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for sub in ['lob', 'numbers']:
        folder = os.path.join(location, sub)
        seg_files = glob.glob(os.path.join(folder, '*.seg'))

        # Sum the header counts for the progress bar, closing each file
        # right after its first line has been read.
        length = 0
        for seg_path in seg_files:
            with open(seg_path, 'r') as f:
                length += int(f.readline())

        itr = 0
        for fl in seg_files:
            image = cv2.imread(fl[:-4] + ".tiff")
            with open(fl) as f:
                f.readline()  # skip the header line counted above
                for line in f:
                    fields = line.strip().split(' ')
                    rect = [int(val) for val in fields[1:]]
                    word = fields[0].split('_')[0]
                    im = image[rect[2]:rect[3], rect[0]:rect[1]]

                    # A zero in the shape means the crop is empty.
                    if 0 not in im.shape:
                        cv2.imwrite(
                            os.path.join(
                                output,
                                '%s_%s_%s.png' % (word, number, time.time())),
                            im)
                    print_progress_bar(itr, length)
                    itr += 1

    print("\tNumber of words:", len(os.listdir(output)))
Example #5
0
def extract(location, output, number=2):
    """Copy the word images listed in ``words.txt`` into flat folders.

    Each non-comment line of ``<location>/words.txt`` is split on single
    spaces: the first field is the image id, the second the segmentation
    status and the last the word. The image is looked up at
    ``<location>/words/<a>/<a>-<b>/<id>.png`` where ``a`` and ``b`` are the
    first two dash-separated parts of the id. Images whose status is
    ``'ok'`` go to ``<location>/<output>``, all others to
    ``<location>/words_with_error``. Empty files, the words ``.``, ``-``
    and ``'``, and words containing any item of the module-level
    ``prohibited`` collection are skipped.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    err_output = os.path.join(location, 'words_with_error')
    if not os.path.exists(output):
        os.makedirs(output)
    if not os.path.exists(err_output):
        os.makedirs(err_output)

    folder = os.path.join(location, 'words')
    label_file = os.path.join(location, 'words.txt')

    # Read the label file once, inside ``with``, instead of opening it a
    # second time just to count its lines.
    with open(label_file) as fp:
        lines = fp.readlines()
    length = len(lines)

    for i, line in enumerate(lines):
        if line[0] != '#':
            fields = line.strip().split(" ")
            id_parts = fields[0].split('-')
            impath = os.path.join(
                folder, id_parts[0],
                id_parts[0] + '-' + id_parts[1],
                fields[0] + '.png')
            word = fields[-1]

            if (os.stat(impath).st_size != 0
                    and word not in ['.', '-', "'"]
                    and not any(item in word for item in prohibited)):

                out = output if fields[1] == 'ok' else err_output
                outpath = os.path.join(
                    out, "%s_%s_%s.png" % (word, number, time.time()))
                copyfile(impath, outpath)

        print_progress_bar(i, length)
    print("\tNumber of words:", len(os.listdir(output)))
Example #6
0
def extract(location, output, number=1):
    """Re-save the images of the four source folders as PNGs in one folder.

    Every entry of ``words``, ``archive``, ``cz_raw`` and ``en_raw`` (all
    inside ``location``) is opened with ``Image.open`` and saved to
    ``<location>/<output>/<word>_<number>_<suffix>.png``, where ``word`` is
    the file name up to the first underscore and ``suffix`` is the part
    after the last underscore with its final four characters removed.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.
        number: Dataset identifier embedded in the output file names.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    for subset in ('words', 'archive', 'cz_raw', 'en_raw'):
        src_dir = os.path.join(location, subset)
        names = os.listdir(src_dir)

        for idx, name in enumerate(names):
            parts = name.split('_')
            target_name = '%s_%s_%s.png' % (parts[0], number, parts[-1][:-4])
            source = os.path.join(src_dir, name)
            Image.open(source).save(os.path.join(output, target_name))
            print_progress_bar(idx, len(names))

    word_count = len(os.listdir(output))
    print("\tNumber of words:", word_count)
Example #7
0
def extract(location, output, number=3):
    """Convert the CVL word images (``.tif``) to PNGs in one output folder.

    Files matching ``words/*/*.tif`` below ``cvl-database-1-1/testset`` and
    ``cvl-database-1-1/trainset`` are processed. The word is the part of
    the file name after four dash-separated numbers and before the
    ``.tif`` extension; it is transliterated with ``unidecode`` and the
    image is saved to ``<location>/<output>/<word>_<number>_<timestamp>.png``.
    Zero-byte files are skipped.

    Args:
        location: Root folder of the dataset.
        output: Name of the output folder, created inside ``location``.
        number: Dataset identifier embedded in the output file names.

    Raises:
        AttributeError: If a file name does not follow the expected
            ``<n>-<n>-<n>-<n>-<word>.tif`` pattern.
    """
    output = os.path.join(location, output)
    if not os.path.exists(output):
        os.makedirs(output)

    # Raw string avoids invalid-escape warnings. The dot before ``tif`` is
    # escaped and the pattern anchored: with a bare ``.`` the lazy group
    # stopped early on words containing "tif" ("artifact.tif" gave "a").
    # Matching the base name keeps this independent of the path separator.
    name_pattern = re.compile(r'\d+-\d+-\d+-\d+-(.+)\.tif$')

    for sub in ['cvl-database-1-1/testset', 'cvl-database-1-1/trainset']:
        folder = os.path.join(location, sub)
        images = glob.glob(os.path.join(folder, 'words', '*', '*.tif'))
        length = len(images)

        for i, im in enumerate(images):
            word = name_pattern.match(os.path.basename(im)).group(1)
            word = unidecode.unidecode(word)

            if os.stat(im).st_size != 0:
                outpath = os.path.join(
                    output, '%s_%s_%s.png' % (word, number, time.time()))
                Image.open(im).save(outpath)
            print_progress_bar(i, length)

    print("\tNumber of words:", len(os.listdir(output)))
Example #8
0
        os.makedirs(output_folder)

    # imgs = glob.glob(os.path.join(folder, '*/words-final/*.png'))
    imgs = []
    for ds in args.dataset:
        for loc, _, _ in os.walk(datasets[ds][1].replace("raw", "processed")):
            imgs += glob.glob(os.path.join(loc, '*.png'))

    imgs.sort()
    random.shuffle(imgs)
    
    length = len(imgs)
    sp1 = int(0.8 * length)
    sp2 = int(0.9 * length)
    img_paths = {'train': imgs[:sp1], 'dev': imgs[sp1:sp2], 'test': imgs[sp2:]}
    
    i = 0
    for split in ['train', 'dev', 'test']:
        split_output = os.path.join(output_folder, split)
        if not os.path.exists(split_output):
            os.mkdir(split_output)
        for im_path in img_paths[split]:
            # Copy image
            print_progress_bar(i, length)
            i += 1
        print(
            "\tNumber of %s words: %s" % (split, len(os.listdir(split_output))))

    if args.csv:
        create_csv(output_folder)