def gen_text_phoc_embs(words):
    '''
    Build PHOC descriptors for the given words and for their variations.

    The unigram alphabet is case sensitive: '&', 'A'-'Z', 'a'-'z', '0'-'9'
    (in that order). No bigrams are used and the unigram pyramid levels are
    fixed to (1, 2, 4, 8).

    :param words: sequence of word strings; passed unchanged to
        ``create_word_variations`` and ``build_phoc_descriptor``
    :return: tuple ``(embedding, embedding_var, word_var_strings,
        word_var_dir, root_word_var, conf_words)`` where ``embedding`` is the
        descriptor built from ``words``, ``embedding_var`` is the descriptor
        built from the keys of ``word_var_dir``, ``word_var_strings`` is the
        list of those keys, and ``word_var_dir``, ``root_word_var`` and
        ``conf_words`` are returned as produced by ``create_word_variations``
    '''
    # Local import keeps this block self-contained.
    import string

    # Same characters and order as chr() over the four original ranges, but
    # without concatenating range objects (a TypeError on Python 3).
    unigrams = list('&' + string.ascii_uppercase + string.ascii_lowercase +
                    string.digits)
    bigram_levels = None
    bigrams = None
    phoc_unigram_levels = (1, 2, 4, 8)

    word_var_dir, root_word_var, conf_words = create_word_variations(
        words, enable_conf=True)

    embedding = build_phoc_descriptor(words=words,
                                      phoc_unigrams=unigrams,
                                      bigram_levels=bigram_levels,
                                      phoc_bigrams=bigrams,
                                      unigram_levels=phoc_unigram_levels)

    # Materialise the keys so callers get an indexable list on Python 3 as
    # well (on Python 2 keys() already returned a list).
    word_var_strings = list(word_var_dir.keys())
    embedding_var = build_phoc_descriptor(words=word_var_strings,
                                          phoc_unigrams=unigrams,
                                          bigram_levels=bigram_levels,
                                          phoc_bigrams=bigrams,
                                          unigram_levels=phoc_unigram_levels)

    return (embedding, embedding_var, word_var_strings, word_var_dir,
            root_word_var, conf_words)
def get_word_phoc_representations(word_strings):
    '''
    Build the PHOC descriptor for the variations of the given words.

    The unigram alphabet is case sensitive: '&', 'A'-'Z', 'a'-'z', '0'-'9'
    (in that order). No bigrams are used and the unigram pyramid levels are
    fixed to (1, 2, 4, 8). The shape of the resulting descriptor is printed.

    :param word_strings: sequence of word strings handed to
        ``create_word_variations``
    :return: tuple ``(embedding_var, word_var_strings, word_var_dir,
        root_word_var, conf_words)`` where ``embedding_var`` is the
        descriptor built from the keys of ``word_var_dir``,
        ``word_var_strings`` is the list of those keys, and the remaining
        items are returned as produced by ``create_word_variations``
    '''
    # Local import keeps this block self-contained.
    import string

    # Same characters and order as chr() over the four original ranges, but
    # without concatenating range objects (a TypeError on Python 3).
    unigrams = list('&' + string.ascii_uppercase + string.ascii_lowercase +
                    string.digits)
    bigram_levels = None
    bigrams = None
    phoc_unigram_levels = (1, 2, 4, 8)

    word_var_dir, root_word_var, conf_words = create_word_variations(
        word_strings, enable_conf=True)

    # Materialise the keys so callers get an indexable list on Python 3 as
    # well (on Python 2 keys() already returned a list).
    word_var_strings = list(word_var_dir.keys())
    embedding_var = build_phoc_descriptor(words=word_var_strings,
                                          phoc_unigrams=unigrams,
                                          bigram_levels=bigram_levels,
                                          phoc_bigrams=bigrams,
                                          unigram_levels=phoc_unigram_levels)
    print('embedding variations:', embedding_var.shape)

    return (embedding_var, word_var_strings, word_var_dir, root_word_var,
            conf_words)
def gen_text_phoc_embs(words):
    '''
    Build the PHOC descriptor for the lower-cased versions of the given words.

    The unigram alphabet is 'a'-'z' followed by '0'-'9'. No bigrams are used
    and the unigram pyramid levels are fixed to (1, 2, 4, 8).

    :param words: iterable of strings; each one is lower-cased before the
        descriptor is built (the input itself is not modified)
    :return: whatever ``build_phoc_descriptor`` returns for the lower-cased
        words
    '''
    # Local import keeps this block self-contained.
    import string

    word_strings = [w.lower() for w in words]

    # Same characters and order as chr() over the two original ranges, but
    # without concatenating range objects (a TypeError on Python 3).
    unigrams = list(string.ascii_lowercase + string.digits)
    bigram_levels = None
    bigrams = None
    phoc_unigram_levels = (1, 2, 4, 8)

    return build_phoc_descriptor(words=word_strings,
                                 phoc_unigrams=unigrams,
                                 bigram_levels=bigram_levels,
                                 phoc_bigrams=bigrams,
                                 unigram_levels=phoc_unigram_levels)
def __init__(self,
             gw_root_dir,
             image_extension='.png',
             embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8),
             use_bigrams=False,
             fixed_image_size=None,
             min_image_width_height=30):
    '''
    Constructor

    Loads every word image listed in ``<gw_root_dir>/info.gtp`` that exists
    under ``<gw_root_dir>/words``, reads the train/test index sets from
    ``<gw_root_dir>/IAM_words_indexes_sets.mat`` and builds the string
    embedding of all lower-cased transcriptions.

    :param gw_root_dir: full path to the dataset root dir
    :param image_extension: the extension of image files (default: .png)
    :param embedding: name of the string embedding; one of 'phoc', 'spoc'
                      or 'dctow' (only 'phoc' is implemented)
    :param phoc_unigram_levels: unigram levels passed to
                                build_phoc_descriptor
    :param use_bigrams: if True, bigram level [2] is used with the bigrams
                        returned by get_most_common_n_grams
    :param fixed_image_size: stored unchanged in self.fixed_image_size
    :param min_image_width_height: the minimum height or width a word image
                                   has to have (passed to check_size)
    :raises ValueError: if embedding is not one of the accepted names
    :raises NotImplementedError: if embedding is 'spoc' or 'dctow'
    '''
    # Local import keeps this block self-contained.
    import string

    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        # The message lists exactly the values accepted above.
        raise ValueError('embedding must be one of phoc, spoc or dctow')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None

    self.fixed_image_size = fixed_image_size
    self.path = gw_root_dir

    train_test_mat = scipy.io.loadmat(
        os.path.join(gw_root_dir, 'IAM_words_indexes_sets.mat'))

    gt_file = os.path.join(gw_root_dir, 'info.gtp')
    words = []
    # 'with' guarantees the ground-truth file is closed again.
    with open(gt_file) as gt_handle:
        for line in gt_handle:
            if line.startswith("#"):
                continue
            word_info = line.split()
            # The last two whitespace-separated fields are used as
            # transcription and image name.
            img_name = word_info[-1]
            transcr = word_info[-2]

            # Image 'a-b-...' is looked up at words/a/a-b/<img_name><ext>.
            img_paths = img_name.split('-')
            word_img_filename = img_paths[0] + '/' + \
                img_paths[0] + '-' + img_paths[1] + '/' + \
                img_name + image_extension
            word_img_filename = os.path.join(gw_root_dir, 'words',
                                             word_img_filename)

            if not os.path.isfile(word_img_filename):
                continue

            try:
                word_img = img_io.imread(word_img_filename)
            except Exception:
                # Best effort: skip images that cannot be read, but let
                # KeyboardInterrupt / SystemExit propagate.
                continue

            # scale black pixels to 1 and white pixels to 0
            word_img = 1 - word_img.astype(np.float32) / 255.0
            word_img = check_size(
                img=word_img,
                min_image_width_height=min_image_width_height)
            words.append((word_img, transcr.lower()))

    self.train_ids = [x[0] for x in train_test_mat.get('idxTrain')]
    self.test_ids = [x[0] for x in train_test_mat.get('idxTest')]

    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # create embedding for the word_list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        # Unigram alphabet 'a'-'z' then '0'-'9'; built without
        # concatenating range objects (a TypeError on Python 3).
        unigrams = list(string.ascii_lowercase + string.digits)
        if use_bigrams:
            bigram_levels = [2]
            bigrams = get_most_common_n_grams(word_strings)
        else:
            bigram_levels = None
            bigrams = None

        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            bigram_levels=bigram_levels,
            phoc_bigrams=bigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:
        # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)
def __init__(self,
             gw_root_dir,
             image_extension='.png',
             cv_split_method=None,
             cv_split_idx=None,
             embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8),
             fixed_image_size=None,
             min_image_width_height=30):
    '''
    Constructor

    Reads every page image in ``<gw_root_dir>/pages``, cuts out the word
    images listed in the matching ``<gw_root_dir>/ground_truth/<page>.gtp``
    annotation file and builds the string embedding of all transcriptions.

    :param gw_root_dir: full path to the GW root dir
    :param image_extension: the extension of image files (default: .png)
    :param cv_split_method: the CV method to be used for splitting the
                            dataset; one of None, 'almazan' or 'fifepages'
                            (only 'almazan' is implemented)
    :param cv_split_idx: the index of the CV split to be used; requires
                         cv_split_method to be set
    :param embedding: name of the string embedding; one of 'phoc', 'spoc'
                      or 'dctow' (only 'phoc' is implemented)
    :param phoc_unigram_levels: unigram levels passed to
                                build_phoc_descriptor
    :param fixed_image_size: stored unchanged in self.fixed_image_size
    :param min_image_width_height: the minimum height or width a word image
                                   has to have (passed to check_size)
    :raises ValueError: if embedding or cv_split_method has an unsupported
                        value, or cv_split_idx is given without a
                        cv_split_method
    :raises NotImplementedError: for the 'spoc' / 'dctow' embeddings and the
                                 'fifepages' split method
    '''
    # Local import keeps this block self-contained.
    import string

    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        raise ValueError('embedding must be one of phoc, spoc or dctow')
    if cv_split_method not in [None, 'almazan', 'fifepages']:
        raise ValueError('cv_split_method must be one of None, almazan or fifepages')
    if cv_split_idx is not None and cv_split_method is None:
        raise ValueError('if cv_split_idx is not None, you need to choose a cv_split_method')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None

    self.fixed_image_size = fixed_image_size

    # load the dataset
    pages_dir = os.path.join(gw_root_dir, 'pages')
    img_filenames = sorted([elem for elem in os.listdir(pages_dir)
                            if elem.endswith(image_extension)])

    words = []
    for img_filename in img_filenames:
        # File name without its last extension, split once and reused for
        # both the page id and the annotation file name.
        stem_parts = img_filename.split('.')[:-1]
        page_id = '.'.join(stem_parts)
        doc_img = img_io.imread(os.path.join(pages_dir, img_filename))
        # scale black pixels to 1 and white pixels to 0
        doc_img = 1 - doc_img.astype(np.float32) / 255.0

        annotation_filename = '.'.join(stem_parts + ['gtp'])
        annotation_lines = LineListIO.read_list(
            os.path.join(gw_root_dir, 'ground_truth', annotation_filename))

        # each line is the annotation of a word image in the following format
        #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
        for line in annotation_lines:
            ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
            ul_x, ul_y, lr_x, lr_y = int(ul_x), int(ul_y), int(lr_x), int(lr_y)
            # copy() detaches the crop from the full page array
            word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
            word_img = check_size(img=word_img,
                                  min_image_width_height=min_image_width_height)
            words.append((word_img, transcr, page_id))

    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # Fixed unigram alphabet 'a'-'z' then '0'-'9'; built without
    # concatenating range objects (a TypeError on Python 3).
    unigrams = list(string.ascii_lowercase + string.digits)

    # create embedding for the word_list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:
        # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)

    self.cv_split_method = cv_split_method
    self.cv_split_index = cv_split_idx

    if cv_split_method is not None:
        if cv_split_method == 'almazan':
            # CV splits as done in Almazan 2014
            self.split_ids = np.load(
                os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
        else:
            # fifepages CV
            raise NotImplementedError()
def __init__(self,
             gw_root_dir,
             image_extension='.png',
             cv_split_method=None,
             cv_split_idx=None,
             embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8),
             fixed_image_size=None,
             min_image_width_height=30):
    '''
    Constructor

    Walks the page images below ``<gw_root_dir>/pages`` in sorted order,
    crops the annotated words out of each page using the bounding boxes in
    ``<gw_root_dir>/ground_truth/<page>.gtp`` and computes the string
    embedding for all transcriptions.

    :param gw_root_dir: full path to the GW root dir
    :param image_extension: the extension of image files (default: .png)
    :param cv_split_method: the CV method to be used for splitting the
                            dataset; one of None, 'almazan' or 'fifepages'
                            (only 'almazan' is implemented)
    :param cv_split_idx: the index of the CV split to be used; may only be
                         given together with a cv_split_method
    :param embedding: name of the string embedding; one of 'phoc', 'spoc'
                      or 'dctow' (only 'phoc' is implemented)
    :param phoc_unigram_levels: unigram levels passed to
                                build_phoc_descriptor
    :param fixed_image_size: stored unchanged in self.fixed_image_size
    :param min_image_width_height: the minimum height or width a word image
                                   has to have (passed to check_size)
    :raises ValueError: if embedding or cv_split_method has an unsupported
                        value, or cv_split_idx is given without a
                        cv_split_method
    :raises NotImplementedError: for the 'spoc' / 'dctow' embeddings and the
                                 'fifepages' split method
    '''
    # Local import keeps this block self-contained.
    import string

    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        raise ValueError('embedding must be one of phoc, spoc or dctow')
    if cv_split_method not in [None, 'almazan', 'fifepages']:
        raise ValueError(
            'cv_split_method must be one of None, almazan or fifepages')
    if cv_split_idx is not None and cv_split_method is None:
        raise ValueError(
            'if cv_split_idx is not None, you need to choose a cv_split_method'
        )

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None

    self.fixed_image_size = fixed_image_size

    # load the dataset
    img_filenames = sorted([
        elem for elem in os.listdir(os.path.join(gw_root_dir, 'pages'))
        if elem.endswith(image_extension)
    ])

    words = []
    for img_filename in img_filenames:
        # Everything before the last '.' of the file name; computed once and
        # shared by the page id and the annotation file name.
        name_parts = img_filename.split('.')[:-1]
        page_id = '.'.join(name_parts)
        annotation_filename = '.'.join(name_parts + ['gtp'])

        doc_img = img_io.imread(
            os.path.join(gw_root_dir, 'pages', img_filename))
        # scale black pixels to 1 and white pixels to 0
        doc_img = 1 - doc_img.astype(np.float32) / 255.0

        annotation_lines = LineListIO.read_list(
            os.path.join(gw_root_dir, 'ground_truth', annotation_filename))

        # each line is the annotation of a word image in the following
        # format
        #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
        for line in annotation_lines:
            ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
            ul_x, ul_y = int(ul_x), int(ul_y)
            lr_x, lr_y = int(lr_x), int(lr_y)
            # copy() detaches the crop from the full page array
            word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
            word_img = check_size(
                img=word_img,
                min_image_width_height=min_image_width_height)
            words.append((word_img, transcr, page_id))

    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # Fixed unigram alphabet 'a'-'z' then '0'-'9'; built without
    # concatenating range objects (a TypeError on Python 3).
    unigrams = list(string.ascii_lowercase + string.digits)

    # create embedding for the word_list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:
        # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)

    self.cv_split_method = cv_split_method
    self.cv_split_index = cv_split_idx

    if cv_split_method is not None:
        if cv_split_method == 'almazan':
            # CV splits as done in Almazan 2014
            self.split_ids = np.load(
                os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))
        else:
            # fifepages CV
            raise NotImplementedError()
# Choose the PHOC unigram alphabet. Each range is wrapped in list() because
# range objects cannot be concatenated with '+' on Python 3.
if is_lower == 0:
    # case sensitive: '&', 'A'-'Z', 'a'-'z', '0'-'9'
    unigrams = [chr(i) for i in
                list(range(ord('&'), ord('&') + 1)) +
                list(range(ord('A'), ord('Z') + 1)) +
                list(range(ord('a'), ord('z') + 1)) +
                list(range(ord('0'), ord('9') + 1))]
else:
    # lower case only: 'a'-'z', '0'-'9'
    unigrams = [chr(i) for i in
                list(range(ord('a'), ord('z') + 1)) +
                list(range(ord('0'), ord('9') + 1))]

# Lower-case the words in place (the same list object is reused below).
# Note this only happens for is_lower == 1, whereas the lower-case alphabet
# above is selected for every non-zero value.
if is_lower == 1:
    for i, word in enumerate(word_strings):
        word_strings[i] = word.lower()

embedding = build_phoc_descriptor(words=word_strings,
                                  phoc_unigrams=unigrams,
                                  bigram_levels=bigram_levels,
                                  phoc_bigrams=bigrams,
                                  unigram_levels=phoc_unigram_levels)

# Single-argument print(...) produces the same output on Python 2 and 3.
print(embedding.shape)

count, close_count, matched_words, new_outputs, new_embedding, new_word_strings, \
    qualified_ids = report_matches(outputs, embedding, 'cosine', word_strings,
                                   original_words, k=1, length=3,
                                   is_lower=is_lower)

# Fraction of qualified samples counted as correct by report_matches.
# NOTE(review): raises ZeroDivisionError when qualified_ids is empty, exactly
# as before -- confirm whether that case can occur.
accuracy = count / float(len(qualified_ids))
print("the accuracy is: " + str(accuracy))

# Update the running totals kept by the surrounding code.
global_total += len(qualified_ids)
global_correct += count
avg_accuracy += accuracy
def __init__(self,
             map_root_dir1,
             map_root_dir2,
             all_files,
             embedding='phoc',
             phoc_unigram_levels=(1, 2, 4, 8),
             use_bigrams=False,
             fixed_image_size=None,
             min_image_width_height=30,
             is_lower=1):
    '''
    Constructor

    For every entry of all_files an image array is loaded from
    ``map_root_dir1 + file`` and a transcription array from
    ``map_root_dir2 + file`` (both with np.load). All samples are collected
    in self.words, the samples of the FIRST file are marked as the train
    split, all others as the test split, and the string embedding of all
    transcriptions is built.

    :param map_root_dir1: string prefix prepended (plain concatenation) to
                          each file name to locate the word image array
    :param map_root_dir2: string prefix prepended (plain concatenation) to
                          each file name to locate the transcription array
    :param all_files: sequence of file names; must not be empty
    :param embedding: name of the string embedding; one of 'phoc', 'spoc'
                      or 'dctow' (only 'phoc' is implemented)
    :param phoc_unigram_levels: unigram levels passed to
                                build_phoc_descriptor
    :param use_bigrams: if True, bigram level [2] is used with the bigrams
                        returned by get_most_common_n_grams
    :param fixed_image_size: stored unchanged in self.fixed_image_size
    :param min_image_width_height: the minimum height or width a word image
                                   has to have (passed to check_size)
    :param is_lower: if truthy, transcriptions are lower-cased and the
                     unigram alphabet is 'a'-'z', '0'-'9'; otherwise the
                     alphabet is '&', 'A'-'Z', 'a'-'z', '0'-'9'
    :raises ValueError: if embedding is not one of the accepted names
    :raises NotImplementedError: if embedding is 'spoc' or 'dctow'
    '''
    # Local import keeps this block self-contained.
    import string

    # sanity checks
    if embedding not in ['phoc', 'spoc', 'dctow']:
        # The message lists exactly the values accepted above.
        raise ValueError('embedding must be one of phoc, spoc or dctow')

    # class members
    self.word_list = None
    self.word_string_embeddings = None
    self.query_list = None
    self.label_encoder = None

    self.fixed_image_size = fixed_image_size

    words = []
    lens = []
    for _file in all_files:
        A = np.load(map_root_dir1 + _file)
        B = np.load(map_root_dir2 + _file)
        for _id in range(len(A)):
            word_img = A[_id]
            transcr = B[_id]
            # Invert and rescale: 255 maps to 0 and 0 maps to 1.
            word_img = 1 - word_img.astype(np.float32) / 255.0
            word_img = check_size(
                img=word_img,
                min_image_width_height=min_image_width_height)
            # Moves the last axis to the front; presumably
            # (height, width, channels) -> (channels, height, width)
            # -- TODO confirm.
            word_img = np.transpose(word_img, (2, 0, 1))
            if is_lower:
                words.append((word_img, transcr.lower()))
            else:
                words.append((word_img, transcr))
        lens.append(len(A))

    # The samples of the first file form the train split. Both id arrays are
    # 0/1 masks with one entry per sample.
    numTrain = lens[0]
    train_ids = np.zeros(len(words))
    train_ids[:numTrain] = 1
    test_ids = 1 - train_ids
    self.train_ids = train_ids
    self.test_ids = test_ids

    self.words = words

    # compute a mapping from class string to class id
    self.label_encoder = LabelEncoder()
    self.label_encoder.fit([elem[1] for elem in words])

    # create embedding for the word_list
    self.word_embeddings = None
    word_strings = [elem[1] for elem in words]
    if embedding == 'phoc':
        # Unigram alphabets are built from string constants instead of
        # concatenated range objects (a TypeError on Python 3); characters
        # and order are unchanged.
        if is_lower:
            unigrams = list(string.ascii_lowercase + string.digits)
        else:
            unigrams = list('&' + string.ascii_uppercase +
                            string.ascii_lowercase + string.digits)
        if use_bigrams:
            bigram_levels = [2]
            bigrams = get_most_common_n_grams(word_strings)
        else:
            bigram_levels = None
            bigrams = None

        self.word_embeddings = build_phoc_descriptor(
            words=word_strings,
            phoc_unigrams=unigrams,
            bigram_levels=bigram_levels,
            phoc_bigrams=bigrams,
            unigram_levels=phoc_unigram_levels)
    elif embedding == 'spoc':
        raise NotImplementedError()
    else:
        # dctow
        raise NotImplementedError()
    self.word_embeddings = self.word_embeddings.astype(np.float32)