def __init__(self, gw_root_dir, image_extension='.png', embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8), use_bigrams=False,
             fixed_image_size=None, min_image_width_height=30):
    '''
    Constructor

    :param gw_root_dir: full path to the IAM root dir
    :param image_extension: the extension of image files (default: png)
    :param embedding: the word embedding to build (one of phoc, spoc, dctow)
    :param phoc_unigram_levels: the pyramid levels for the PHOC unigram histograms
    :param use_bigrams: whether to append bigram histograms to the PHOC
    :param fixed_image_size: if not None, word images are resized to this size
    :param min_image_width_height: the minimum height or width a word image has to have
    '''
    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        raise ValueError('embedding must be one of phoc, spoc or dctow')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None
    self.fixed_image_size = fixed_image_size
    self.path = gw_root_dir

    # the train/test split comes from IAM_words_indexes_sets.mat; an earlier
    # variant read it from old_sets/trainset.txt and old_sets/testset.txt
    train_test_mat = scipy.io.loadmat(
        os.path.join(gw_root_dir, 'IAM_words_indexes_sets.mat'))

    gt_file = os.path.join(gw_root_dir, 'info.gtp')
    words = []
    for line in open(gt_file):
        if line.startswith('#'):
            continue
        word_info = line.split()
        img_name = word_info[-1]
        transcr = word_info[-2]
        # reconstruct the word image path from the IAM-style word id
        img_paths = img_name.split('-')
        word_img_filename = img_paths[0] + '/' + \
            img_paths[0] + '-' + img_paths[1] + '/' + \
            img_name + image_extension
        word_img_filename = os.path.join(gw_root_dir, 'words', word_img_filename)
        if not os.path.isfile(word_img_filename):
            continue
        try:
            word_img = img_io.imread(word_img_filename)
        except Exception:
            # skip unreadable or corrupt word images
            continue
        # scale black pixels to 1 and white pixels to 0
        word_img = 1 - word_img.astype(np.float32) / 255.0
        word_img = check_size(img=word_img,
                              min_image_width_height=min_image_width_height)
        words.append((word_img, transcr.lower()))

    self.train_ids = [x[0] for x in train_test_mat.get('idxTrain')]
    self.test_ids = [x[0] for x in train_test_mat.get('idxTest')]
    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # create embedding for the word list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        # extract unigrams (lower-case letters and digits)
        unigrams = [chr(i)
                    for i in list(range(ord('a'), ord('z') + 1)) +
                    list(range(ord('0'), ord('9') + 1))]
        # unigrams = get_unigrams_from_strings(word_strings=[elem[1] for elem in words])
        if use_bigrams:
            bigram_levels = [2]
            bigrams = get_most_common_n_grams(word_strings)
        else:
            bigram_levels = None
            bigrams = None
        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            bigram_levels=bigram_levels,
            phoc_bigrams=bigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:  # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)
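
# Illustrative sketch (not used by the loader): how an IAM-style word id maps
# to a relative image path in the loop above. The id 'a01-000u-00-00' is an
# assumed example in the standard IAM naming scheme (form prefix, form id,
# line and word index); the helper name is hypothetical.
def _demo_iam_word_path(img_name='a01-000u-00-00', image_extension='.png'):
    img_paths = img_name.split('-')  # ['a01', '000u', '00', '00']
    rel_path = (img_paths[0] + '/' +
                img_paths[0] + '-' + img_paths[1] + '/' +
                img_name + image_extension)
    return rel_path  # 'a01/a01-000u/a01-000u-00-00.png'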
def __init__(self, phoc_layout: PhocLayout, root_dir='data/', embedding='phoc',
             min_image_width_height=30, fixed_image_size=None, max_wordlength=20):
    '''
    We need to fill in:
        self.words            list of tuples (word_img, transcr, page_id): word_img is an
                              intensity matrix, transcr is the transcription string and
                              page_id holds word info (optional?)
        self.split_ids        list of ids tagging each word with a partition label
                              (here: training=1, validation=2, test=3)
        self.word_embeddings  list of targets corresponding to the words
                              (PHOC embeddings or word lengths)
    Filled in automatically:
        self.label_encoder    mapping from class string to class id; initialized after
                              filling in self.words
        self.query_list       defined in MainLoader
    '''
    def xml2jpg(xml):
        base, ext = os.path.splitext(xml)
        return base + '.JPG'

    self.TRAINING_PARTITION = 1
    self.VALIDATION_PARTITION = 2
    self.TEST_PARTITION = 3

    if embedding not in ['phoc', 'wordlength']:
        raise ValueError('embedding must be either phoc or wordlength')

    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None
    self.fixed_image_size = fixed_image_size

    # specify the page images of the set
    all_xmls = []
    for x in range(1, 48):
        if x == 12:
            continue  # page 12 was omitted / doesn't exist
        all_xmls.append(os.path.join(root_dir, '_00{0:02d}.xml'.format(x)))

    # load the dataset
    self.words = []
    self.split_ids = []
    word_id = 1
    for page_id in all_xmls:
        doc_img = img_io.imread(xml2jpg(page_id))
        doc_img = np.mean(doc_img, axis=2)  # the page images are colour; average to grayscale
        doc_img = 1 - doc_img.astype(np.float32) / 255.0  # scale black pixels to 1, white to 0
        for word in get_words_from_pagexml(page_id):
            x, y, w, h = word[1]
            word_img = doc_img[y:y + h, x:x + w].copy()
            word_img = check_size(img=word_img,
                                  min_image_width_height=min_image_width_height)
            # decide on the split id (from the footnote on page 3 of Sfikas et al. 2015)
            if 1 <= word_id <= 2000:
                current_split_id = self.TRAINING_PARTITION
            elif 2001 <= word_id <= 4000:
                current_split_id = self.TEST_PARTITION
            elif 4001 <= word_id <= 4941:
                current_split_id = self.VALIDATION_PARTITION
            else:
                raise ValueError('Word id read out of bounds (={}); it should have been in [1, 4941].'.format(word_id))
            transcr = word[2]
            self.words.append((word_img, transcr, page_id))
            self.split_ids.append(current_split_id)
            word_id += 1

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    word_strings = [elem[1] for elem in self.words]
    self.label_encoder.fit(word_strings)

    # create embeddings for the word list
    self.word_embeddings = None
    if embedding == 'phoc':
        self.word_embeddings = phoc_layout.build_phoc_descriptor(word_strings)
    elif embedding == 'wordlength':
        # encode each word length as a one-hot vector of size max_wordlength
        self.word_embeddings = []
        for word in word_strings:
            if len(word) > max_wordlength:
                raise ValueError('Word length (for word "{}") over max word length ({})'.format(word, max_wordlength))
            tt = np.zeros([max_wordlength])
            tt[len(word) - 1] = 1
            self.word_embeddings.append(tt)
        self.word_embeddings = np.array(self.word_embeddings)
    else:
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)
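
# Illustrative sketch (not used by the loader): the 'wordlength' target built
# above is a one-hot vector whose (len(word) - 1)-th entry is set, shown here
# with the same max_wordlength=20 default. The helper name is hypothetical and
# relies on the module-level numpy import (np).
def _demo_wordlength_embedding(word, max_wordlength=20):
    if len(word) > max_wordlength:
        raise ValueError('word "{}" exceeds max word length {}'.format(word, max_wordlength))
    tt = np.zeros([max_wordlength], dtype=np.float32)
    tt[len(word) - 1] = 1
    return tt  # e.g. _demo_wordlength_embedding('cat') has its 1 at index 2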
def __init__(self, gw_root_dir, image_extension='.png', cv_split_method=None,
             cv_split_idx=None, embedding='phoc', phoc_unigram_levels=(1, 2, 4, 8),
             fixed_image_size=None, min_image_width_height=30):
    '''
    Constructor

    :param gw_root_dir: full path to the GW root dir
    :param image_extension: the extension of image files (default: png)
    :param cv_split_method: the CV method to be used for splitting the dataset;
                            if None, the entire dataset is used
    :param cv_split_idx: the index of the CV split to be used; can only be used
                         if cv_split_method is not None
    :param embedding: the word embedding to build (one of phoc, spoc, dctow)
    :param phoc_unigram_levels: the pyramid levels for the PHOC unigram histograms
    :param fixed_image_size: if not None, word images are resized to this size
    :param min_image_width_height: the minimum height or width a word image has to have
    '''
    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        raise ValueError('embedding must be one of phoc, spoc or dctow')
    if cv_split_method not in [None, 'almazan', 'fifepages']:
        raise ValueError('cv_split_method must be one of None, almazan or fifepages')
    if cv_split_idx is not None and cv_split_method is None:
        raise ValueError('if cv_split_idx is not None, you need to choose a cv_split_method')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None
    self.fixed_image_size = fixed_image_size

    # load the dataset
    img_filenames = sorted([elem for elem in os.listdir(os.path.join(gw_root_dir, 'pages'))
                            if elem.endswith(image_extension)])
    words = []
    for img_filename in img_filenames:
        page_id = '.'.join(img_filename.split('.')[:-1])
        doc_img = img_io.imread(os.path.join(gw_root_dir, 'pages', img_filename))
        # scale black pixels to 1 and white pixels to 0
        doc_img = 1 - doc_img.astype(np.float32) / 255.0
        annotation_filename = '.'.join(img_filename.split('.')[:-1] + ['gtp'])
        annotation_lines = LineListIO.read_list(
            os.path.join(gw_root_dir, 'ground_truth', annotation_filename))
        # each line is the annotation of a word image in the following format:
        # <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
        for line in annotation_lines:
            ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
            ul_x, ul_y, lr_x, lr_y = int(ul_x), int(ul_y), int(lr_x), int(lr_y)
            word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
            word_img = check_size(img=word_img,
                                  min_image_width_height=min_image_width_height)
            #word_img = resize(image=word_img, output_shape=[60, 100]).astype(np.float32)
            words.append((word_img, transcr, page_id))
    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # extract unigrams (lower-case letters and digits)
    unigrams = [chr(i)
                for i in list(range(ord('a'), ord('z') + 1)) +
                list(range(ord('0'), ord('9') + 1))]
    #unigrams = get_unigrams_from_strings(word_strings=[elem[1] for elem in words])

    # create embedding for the word list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:  # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)

    self.cv_split_method = cv_split_method
    self.cv_split_index = cv_split_idx
    if cv_split_method is not None:
        if cv_split_method == 'almazan':
            # CV splits as done in Almazan 2014
            self.split_ids = np.load(os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
        else:
            # fifepages CV
            raise NotImplementedError()
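
# Illustrative sketch (not the repo's implementation): a toy version of the
# PHOC construction that build_phoc_descriptor performs, assuming the common
# rule from Almazan et al. 2014 that a character belongs to a pyramid region
# when at least half of its normalised occupancy overlaps that region. The
# helper name is hypothetical; np is the module-level numpy import.
def _demo_toy_phoc(word, unigrams, levels=(1, 2)):
    hist = []
    for n in levels:
        for r in range(n):  # region r covers [r / n, (r + 1) / n)
            region = np.zeros(len(unigrams), dtype=np.float32)
            for k, ch in enumerate(word):
                if ch not in unigrams:
                    continue
                occ = (k / len(word), (k + 1) / len(word))  # character occupancy
                overlap = min(occ[1], (r + 1) / n) - max(occ[0], r / n)
                if overlap / (occ[1] - occ[0]) >= 0.5:
                    region[unigrams.index(ch)] = 1
            hist.append(region)
    return np.concatenate(hist)
# e.g. _demo_toy_phoc('ab', ['a', 'b']) -> [1, 1, 1, 0, 0, 1]: both characters
# appear at level 1; 'a' falls in the left half and 'b' in the right half at level 2.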
def __init__(self, map_root_dir1, map_root_dir2, all_files, embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8), use_bigrams=False,
             fixed_image_size=None, min_image_width_height=30, is_lower=1):
    '''
    Constructor

    :param map_root_dir1: full path to the directory holding the word image arrays (.npy)
    :param map_root_dir2: full path to the directory holding the matching transcription arrays (.npy)
    :param all_files: the .npy file names to load; the first file forms the training set,
                      the remaining ones the test set
    :param embedding: the word embedding to build (one of phoc, spoc, dctow)
    :param phoc_unigram_levels: the pyramid levels for the PHOC unigram histograms
    :param use_bigrams: whether to append bigram histograms to the PHOC
    :param fixed_image_size: if not None, word images are resized to this size
    :param min_image_width_height: the minimum height or width a word image has to have
    :param is_lower: if truthy, lower-case all transcriptions and use a lower-case unigram set
    '''
    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        raise ValueError('embedding must be one of phoc, spoc or dctow')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None
    self.fixed_image_size = fixed_image_size

    # load the dataset (an earlier variant read files named
    # 'original_images_nopad_<file>.tiff.npy' / 'original_words_nopad_<file>.tiff.npy')
    words = []
    lens = []
    for _file in all_files:
        A = np.load(map_root_dir1 + _file)  # word images
        B = np.load(map_root_dir2 + _file)  # transcriptions
        for _id in range(len(A)):
            word_img = A[_id]
            transcr = B[_id]
            # scale black pixels to 1 and white pixels to 0
            word_img = 1 - word_img.astype(np.float32) / 255.0
            word_img = check_size(img=word_img,
                                  min_image_width_height=min_image_width_height)
            word_img = np.transpose(word_img, (2, 0, 1))  # HWC -> CHW
            if is_lower:
                words.append((word_img, transcr.lower()))
            else:
                words.append((word_img, transcr))
        lens.append(len(A))

    # all samples of the first file are used for training, the rest for testing
    # (no shuffling: a random.shuffle of the indices was deliberately left out)
    numTrain = lens[0]
    _ids_all = list(range(len(words)))
    train_ids = np.zeros(len(words))
    train_ids[_ids_all[0:numTrain]] = 1
    test_ids = 1 - train_ids
    self.train_ids = train_ids
    self.test_ids = test_ids
    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # create embedding for the word list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        # extract unigrams
        if is_lower:
            # lower-case letters and digits
            unigrams = [chr(i)
                        for i in list(range(ord('a'), ord('z') + 1)) +
                        list(range(ord('0'), ord('9') + 1))]
        else:
            # ampersand, upper- and lower-case letters and digits
            unigrams = [chr(i)
                        for i in [ord('&')] +
                        list(range(ord('A'), ord('Z') + 1)) +
                        list(range(ord('a'), ord('z') + 1)) +
                        list(range(ord('0'), ord('9') + 1))]
        # unigrams = get_unigrams_from_strings(word_strings=[elem[1] for elem in words])
        if use_bigrams:
            bigram_levels = [2]
            bigrams = get_most_common_n_grams(word_strings)
        else:
            bigram_levels = None
            bigrams = None
        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            bigram_levels=bigram_levels,
            phoc_bigrams=bigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:  # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)
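
# Illustrative sketch (not used by the loader): the split logic above marks
# every sample from the first file in all_files as training and everything
# after it as test, with no shuffling. The helper name is hypothetical; np is
# the module-level numpy import.
def _demo_first_file_split(lens):
    train_ids = np.zeros(sum(lens))
    train_ids[:lens[0]] = 1
    test_ids = 1 - train_ids
    return train_ids, test_ids
# e.g. _demo_first_file_split([3, 2]) -> ([1, 1, 1, 0, 0], [0, 0, 0, 1, 1])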