Beispiel #1
0
    def __init__(self,
                 gw_root_dir,
                 image_extension='.png',
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 use_bigrams=False,
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Load the word images listed in ``info.gtp`` together with their
        transcriptions and build a string embedding for every word.

        :param gw_root_dir: dataset root directory; ``info.gtp``,
                            ``IAM_words_indexes_sets.mat`` and the ``words``
                            image directory are looked up below it
        :param image_extension: extension appended to each image name
                                (default: .png)
        :param embedding: one of 'phoc', 'spoc' or 'dctow'; only 'phoc' is
                          implemented
        :param phoc_unigram_levels: unigram levels forwarded to
                                    build_phoc_descriptor
        :param use_bigrams: if True, bigrams obtained from
                            get_most_common_n_grams are used at level 2
        :param fixed_image_size: stored unchanged in self.fixed_image_size
        :param min_image_width_height: forwarded to check_size for every
                                       word image
        :raises ValueError: if embedding is not one of the accepted names
        :raises NotImplementedError: if embedding is 'spoc' or 'dctow'
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        self.path = gw_root_dir

        train_test_mat = scipy.io.loadmat(
            os.path.join(gw_root_dir, 'IAM_words_indexes_sets.mat'))

        gt_file = os.path.join(gw_root_dir, 'info.gtp')
        words = []
        # the context manager guarantees the ground-truth file is closed
        with open(gt_file) as gt_handle:
            for line in gt_handle:
                # lines starting with '#' are comments
                if line.startswith("#"):
                    continue
                word_info = line.split()
                if not word_info:
                    # blank line, nothing to parse
                    continue
                # the image name is the last token, the transcription the
                # one before it
                img_name = word_info[-1]
                transcr = word_info[-2]

                # images are stored as words/<p0>/<p0>-<p1>/<name><ext>,
                # where p0 and p1 are the first two '-' separated name parts
                img_paths = img_name.split('-')
                word_img_filename = os.path.join(
                    gw_root_dir, 'words', img_paths[0],
                    img_paths[0] + '-' + img_paths[1],
                    img_name + image_extension)

                if not os.path.isfile(word_img_filename):
                    continue

                # Unreadable images are skipped on purpose (best effort),
                # but KeyboardInterrupt/SystemExit must still propagate.
                try:
                    word_img = img_io.imread(word_img_filename)
                except Exception:
                    continue
                # scale black pixels to 1 and white pixels to 0
                word_img = 1 - word_img.astype(np.float32) / 255.0

                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                words.append((word_img, transcr.lower()))

        # NOTE(review): the split indicators come from the .mat file while
        # missing/unreadable images are skipped above -- presumably they only
        # line up with self.words when nothing was skipped; confirm.
        self.train_ids = [x[0] for x in train_test_mat.get('idxTrain')]
        self.test_ids = [x[0] for x in train_test_mat.get('idxTest')]

        self.words = words

        # compute a mapping from class string to class id
        word_strings = [elem[1] for elem in words]
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(word_strings)

        # create embedding for the word_list
        self.word_embeddings = None
        if embedding == 'phoc':
            # Unigrams are the lower-case letters followed by the digits.
            # A literal is used because adding two range objects raises a
            # TypeError on Python 3.
            unigrams = list('abcdefghijklmnopqrstuvwxyz0123456789')
            if use_bigrams:
                bigram_levels = [2]
                bigrams = get_most_common_n_grams(word_strings)
            else:
                bigram_levels = None
                bigrams = None

            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                bigram_levels=bigram_levels,
                phoc_bigrams=bigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)
    def __init__(
        self,
        phoc_layout: PhocLayout,
        root_dir='data/',
        embedding='phoc',
        min_image_width_height=30,
        fixed_image_size=None,
        max_wordlength=20,
    ):
        '''
        Crop every annotated word out of the page images and build the
        targets (PHOC embeddings or one-hot word lengths).

        Attributes filled in here:
        self.words            list of tuples (word_img, transcr, page_id);
                              word_img is an intensity matrix, transcr the
                              transcription string, page_id the path of the
                              page XML the word was read from
        self.split_ids        one partition label per word
                              (training=1, validation=2, test=3)
        self.word_embeddings  float32 array of targets, one row per word
        self.label_encoder    LabelEncoder fitted on all transcriptions

        self.query_list is only initialised to None here.

        :param phoc_layout: object whose build_phoc_descriptor(word_strings)
                            produces the PHOC targets
        :param root_dir: directory holding the _00NN.xml page files and the
                         matching .JPG page images
        :param embedding: 'phoc' or 'wordlength'
        :param min_image_width_height: forwarded to check_size for every
                                       word image
        :param fixed_image_size: stored unchanged in self.fixed_image_size
        :param max_wordlength: length of the one-hot vector used by the
                               'wordlength' embedding
        :raises ValueError: if embedding is not accepted or more than 4941
                            words are read
        :raises SystemExit: if a word is longer than max_wordlength while
                            the 'wordlength' embedding is requested
        '''
        def xml2jpg(xml):
            # page image shares the XML's base name, with a .JPG extension
            base, _ = os.path.splitext(xml)
            return base + '.JPG'

        self.TRAINING_PARTITION = 1
        self.VALIDATION_PARTITION = 2
        self.TEST_PARTITION = 3
        if embedding not in ['phoc', 'wordlength']:
            raise ValueError('embedding must be either phoc or wordlength')

        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None
        self.fixed_image_size = fixed_image_size

        # Specify images of the set: pages 1..47 without page 12, which was
        # omitted / doesn't exist
        all_xmls = [
            os.path.join(root_dir, '_00{0:02d}.xml'.format(page_no))
            for page_no in range(1, 48) if page_no != 12
        ]

        # load the dataset
        self.words = []
        self.split_ids = []
        word_id = 1
        for page_id in all_xmls:
            doc_img = img_io.imread(xml2jpg(page_id))
            # The pages are colour images, so the channels are averaged to
            # intensities; an image that is already 2-D is used as it is.
            if doc_img.ndim == 3:
                doc_img = np.mean(doc_img, axis=2)
            # scale black pixels to 1 and white pixels to 0
            doc_img = 1 - doc_img.astype(np.float32) / 255.0
            for word in get_words_from_pagexml(page_id):
                x, y, w, h = word[1]
                word_img = doc_img[y:y + h, x:x + w].copy()
                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                # Decide on split_id (this comes from footnote on page 3 of
                # Sfikas et al.2015)
                if 1 <= word_id <= 2000:
                    current_split_id = self.TRAINING_PARTITION
                elif 2001 <= word_id <= 4000:
                    current_split_id = self.TEST_PARTITION
                elif 4001 <= word_id <= 4941:
                    current_split_id = self.VALIDATION_PARTITION
                else:
                    # report the offending word id; the split id is either
                    # unbound or left over from the previous word here
                    raise ValueError(
                        'Word id read out of bounds (={}); it should have been in [1,4941].'
                        .format(word_id))
                transcr = word[2]
                self.words.append((word_img, transcr, page_id))
                self.split_ids.append(current_split_id)
                word_id += 1

        self.label_encoder = LabelEncoder()
        word_strings = [elem[1] for elem in self.words]
        self.label_encoder.fit(word_strings)

        self.word_embeddings = None
        if embedding == 'phoc':
            self.word_embeddings = phoc_layout.build_phoc_descriptor(
                word_strings)
        elif embedding == 'wordlength':
            length_targets = []
            for transcription in word_strings:
                one_hot = np.zeros([
                    max_wordlength,
                ])
                # NOTE(review): an empty transcription indexes position -1
                # and therefore marks the last bin -- confirm this is wanted.
                try:
                    one_hot[len(transcription) - 1] = 1
                except IndexError:
                    print(
                        'Word length (for word "{}") over max word length ({})'
                        .format(transcription, max_wordlength))
                    # same exit status as before, without depending on the
                    # exit() helper that only the site module installs
                    raise SystemExit(1)
                length_targets.append(one_hot)
            self.word_embeddings = np.array(length_targets)
        else:
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)
Beispiel #3
0
    def __init__(self,
                 gw_root_dir,
                 image_extension='.png',
                 cv_split_method=None,
                 cv_split_idx=None,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor: crops all annotated words out of the page images and
        builds their string embeddings.

        :param gw_root_dir: full path to the GW root dir; page images are
                            read from its 'pages' sub-directory and the
                            annotations from 'ground_truth'
        :param image_extension: the extension of image files (default: png)
        :param cv_split_method: the CV method to be used for splitting the
                                dataset (None, 'almazan' or 'fifepages');
                                only 'almazan' is implemented
        :param cv_split_idx: the index of the CV split to be used; requires
                             cv_split_method to be set
        :param embedding: one of 'phoc', 'spoc' or 'dctow'; only 'phoc' is
                          implemented
        :param phoc_unigram_levels: unigram levels forwarded to
                                    build_phoc_descriptor
        :param fixed_image_size: stored unchanged in self.fixed_image_size
        :param min_image_width_height: the minimum height or width a word
                                       image has to have (forwarded to
                                       check_size)
        :raises ValueError: on an unknown embedding or cv_split_method, or
                            if cv_split_idx is given without a method
        :raises NotImplementedError: for the 'spoc'/'dctow' embeddings and
                                     the 'fifepages' split method
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')
        if cv_split_method not in [None, 'almazan', 'fifepages']:
            raise ValueError(
                'cv_split_method must be one of None, almazan or fifepages')
        if cv_split_idx is not None and cv_split_method is None:
            raise ValueError(
                'if cv_split_idx is not None, you need to choose a cv_split_method'
            )

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        # load the dataset
        pages_dir = os.path.join(gw_root_dir, 'pages')
        img_filenames = sorted(
            elem for elem in os.listdir(pages_dir)
            if elem.endswith(image_extension))
        words = []
        for img_filename in img_filenames:
            # the page id is the file name without its last extension
            page_id = '.'.join(img_filename.split('.')[:-1])
            doc_img = img_io.imread(os.path.join(pages_dir, img_filename))
            # scale black pixels to 1 and white pixels to 0
            doc_img = 1 - doc_img.astype(np.float32) / 255.0
            annotation_lines = LineListIO.read_list(
                os.path.join(gw_root_dir, 'ground_truth', page_id + '.gtp'))
            # each line is the annotation of a word image in the following format
            #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
            for line in annotation_lines:
                ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
                ul_x, ul_y, lr_x, lr_y = (int(ul_x), int(ul_y), int(lr_x),
                                          int(lr_y))
                word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                words.append((word_img, transcr, page_id))

        self.words = words
        word_strings = [elem[1] for elem in words]

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(word_strings)

        # create embedding for the word_list
        self.word_embeddings = None
        if embedding == 'phoc':
            # Fixed unigram set: lower-case letters followed by the digits.
            # A literal is used because adding two range objects raises a
            # TypeError on Python 3.
            unigrams = list('abcdefghijklmnopqrstuvwxyz0123456789')
            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)

        self.cv_split_method = cv_split_method
        self.cv_split_index = cv_split_idx

        # self.split_ids is only set when a split method is requested
        if cv_split_method is not None:
            if cv_split_method == 'almazan':
                # CV splits as done in Almazan 2014
                self.split_ids = np.load(
                    os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
            else:
                # fifepages CV
                raise NotImplementedError()
    def __init__(self, gw_root_dir, image_extension='.png',
                 cv_split_method=None, cv_split_idx=None,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor: crops all annotated words out of the page images and
        builds their string embeddings.

        :param gw_root_dir: full path to the GW root dir; page images are
                            read from its 'pages' sub-directory and the
                            annotations from 'ground_truth'
        :param image_extension: the extension of image files (default: png)
        :param cv_split_method: the CV method to be used for splitting the
                                dataset (None, 'almazan' or 'fifepages');
                                only 'almazan' is implemented
        :param cv_split_idx: the index of the CV split to be used; requires
                             cv_split_method to be set
        :param embedding: one of 'phoc', 'spoc' or 'dctow'; only 'phoc' is
                          implemented
        :param phoc_unigram_levels: unigram levels forwarded to
                                    build_phoc_descriptor
        :param fixed_image_size: stored unchanged in self.fixed_image_size
        :param min_image_width_height: the minimum height or width a word
                                       image has to have (forwarded to
                                       check_size)
        :raises ValueError: on an unknown embedding or cv_split_method, or
                            if cv_split_idx is given without a method
        :raises NotImplementedError: for the 'spoc'/'dctow' embeddings and
                                     the 'fifepages' split method
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')
        if cv_split_method not in [None, 'almazan', 'fifepages']:
            raise ValueError('cv_split_method must be one of None, almazan or fifepages')
        if cv_split_idx is not None and cv_split_method is None:
            raise ValueError('if cv_split_idx is not None, you need to choose a cv_split_method')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        # load the dataset
        page_dir = os.path.join(gw_root_dir, 'pages')
        gt_dir = os.path.join(gw_root_dir, 'ground_truth')
        img_filenames = sorted(name for name in os.listdir(page_dir)
                               if name.endswith(image_extension))
        words = []
        for img_filename in img_filenames:
            # the page id is the file name without its last extension
            page_id = '.'.join(img_filename.split('.')[:-1])
            doc_img = img_io.imread(os.path.join(page_dir, img_filename))
            # scale black pixels to 1 and white pixels to 0
            doc_img = 1 - doc_img.astype(np.float32) / 255.0
            annotation_lines = LineListIO.read_list(os.path.join(gt_dir, page_id + '.gtp'))
            # each line is the annotation of a word image in the following format
            #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
            for line in annotation_lines:
                ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
                ul_x, ul_y, lr_x, lr_y = int(ul_x), int(ul_y), int(lr_x), int(lr_y)
                word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
                word_img = check_size(img=word_img,
                                      min_image_width_height=min_image_width_height)
                words.append((word_img, transcr, page_id))

        self.words = words
        word_strings = [elem[1] for elem in words]

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(word_strings)

        # create embedding for the word_list
        self.word_embeddings = None
        if embedding == 'phoc':
            # Fixed unigram set: lower-case letters followed by the digits.
            # A literal is used because adding two range objects raises a
            # TypeError on Python 3.
            unigrams = list('abcdefghijklmnopqrstuvwxyz0123456789')
            self.word_embeddings = build_phoc_descriptor(words=word_strings,
                                                         phoc_unigrams=unigrams,
                                                         unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)

        self.cv_split_method = cv_split_method
        self.cv_split_index = cv_split_idx

        # self.split_ids is only set when a split method is requested
        if cv_split_method is not None:
            if cv_split_method == 'almazan':
                # CV splits as done in Almazan 2014
                self.split_ids = np.load(os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
            else:
                # fifepages CV
                raise NotImplementedError()
Beispiel #5
0
    def __init__(self,
                 map_root_dir1,
                 map_root_dir2,
                 all_files,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 use_bigrams=False,
                 fixed_image_size=None,
                 min_image_width_height=30,
                 is_lower=1):
        '''
        Load pre-cropped word images and their transcriptions from paired
        .npy files and build a string embedding for every word.

        :param map_root_dir1: prefix that is concatenated with each entry of
                              all_files to locate the array of word images
        :param map_root_dir2: prefix that is concatenated with each entry of
                              all_files to locate the array of transcriptions
        :param all_files: file names to load; the words of the first file
                          form the training set, all others the test set
        :param embedding: one of 'phoc', 'spoc' or 'dctow'; only 'phoc' is
                          implemented
        :param phoc_unigram_levels: unigram levels forwarded to
                                    build_phoc_descriptor
        :param use_bigrams: if True, bigrams obtained from
                            get_most_common_n_grams are used at level 2
        :param fixed_image_size: stored unchanged in self.fixed_image_size
        :param min_image_width_height: forwarded to check_size for every
                                       word image
        :param is_lower: if truthy, transcriptions are lower-cased and the
                         unigrams are a-z and 0-9; otherwise case is kept
                         and the unigrams are '&', A-Z, a-z and 0-9
        :raises ValueError: if embedding is not one of the accepted names
        :raises NotImplementedError: if embedding is 'spoc' or 'dctow'
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        words = []
        lens = []
        for _file in all_files:
            # prefixes are plain string concatenations, so they must already
            # end with a path separator if they name a directory
            images = np.load(map_root_dir1 + _file)
            transcriptions = np.load(map_root_dir2 + _file)

            for _id in range(len(images)):
                transcr = transcriptions[_id]
                # scale black pixels to 1 and white pixels to 0
                word_img = 1 - images[_id].astype(np.float32) / 255.0
                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                # move the last axis to the front (the images have 3 axes)
                word_img = np.transpose(word_img, (2, 0, 1))
                if is_lower:
                    transcr = transcr.lower()
                words.append((word_img, transcr))
            lens.append(len(images))

        # 0/1 indicator vectors over all words: the words of the first file
        # are the training set, everything else is the test set
        numTrain = lens[0]
        train_ids = np.zeros(len(words))
        train_ids[:numTrain] = 1
        self.train_ids = train_ids
        self.test_ids = 1 - train_ids

        self.words = words

        # compute a mapping from class string to class id
        word_strings = [elem[1] for elem in words]
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(word_strings)

        # create embedding for the word_list
        self.word_embeddings = None
        if embedding == 'phoc':
            # Literals are used because adding range objects raises a
            # TypeError on Python 3.
            lower_and_digits = 'abcdefghijklmnopqrstuvwxyz0123456789'
            if is_lower:
                unigrams = list(lower_and_digits)
            else:
                unigrams = list('&ABCDEFGHIJKLMNOPQRSTUVWXYZ' +
                                lower_and_digits)

            if use_bigrams:
                bigram_levels = [2]
                bigrams = get_most_common_n_grams(word_strings)
            else:
                bigram_levels = None
                bigrams = None

            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                bigram_levels=bigram_levels,
                phoc_bigrams=bigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)