Example #1
0
def gen_text_phoc_embs(words):
    """Build PHOC descriptors for ``words`` and for their word variations.

    The unigram alphabet is, in this order, ``&``, ``A``-``Z``, ``a``-``z``
    and ``0``-``9``. No bigrams are used and the unigram pyramid levels are
    ``(1, 2, 4, 8)``.

    :param words: the word strings to embed; passed unchanged to
                  ``create_word_variations`` and ``build_phoc_descriptor``
    :return: the tuple ``(embedding, embedding_var, word_var_strings,
             word_var_dir, root_word_var, conf_words)``. ``embedding`` is the
             descriptor built from ``words``; ``embedding_var`` is the
             descriptor built from the keys of ``word_var_dir``;
             ``word_var_strings`` is the list of those keys; the remaining
             three values are returned exactly as produced by
             ``create_word_variations(words, enable_conf=True)``.
    """
    # Build the alphabet from inclusive character ranges. Iterating each
    # range separately (instead of concatenating ``range`` results with
    # ``+``) works on Python 3, where ``range`` objects cannot be added.
    char_ranges = (('&', '&'), ('A', 'Z'), ('a', 'z'), ('0', '9'))
    unigrams = [
        chr(code)
        for first, last in char_ranges
        for code in range(ord(first), ord(last) + 1)
    ]
    phoc_unigram_levels = (1, 2, 4, 8)

    word_var_dir, root_word_var, conf_words = create_word_variations(
        words, enable_conf=True)

    embedding = build_phoc_descriptor(words=words,
                                      phoc_unigrams=unigrams,
                                      bigram_levels=None,
                                      phoc_bigrams=None,
                                      unigram_levels=phoc_unigram_levels)

    # Materialise the keys so callers always receive an indexable list
    # (on Python 3 ``keys()`` alone would be a live dictionary view).
    word_var_strings = list(word_var_dir.keys())
    embedding_var = build_phoc_descriptor(words=word_var_strings,
                                          phoc_unigrams=unigrams,
                                          bigram_levels=None,
                                          phoc_bigrams=None,
                                          unigram_levels=phoc_unigram_levels)

    return (embedding, embedding_var, word_var_strings, word_var_dir,
            root_word_var, conf_words)
Example #2
0
def get_word_phoc_representations(word_strings):
    """Build PHOC descriptors for the variations of ``word_strings``.

    The unigram alphabet is, in this order, ``&``, ``A``-``Z``, ``a``-``z``
    and ``0``-``9``. No bigrams are used and the unigram pyramid levels are
    ``(1, 2, 4, 8)``. The shape of the resulting descriptor is printed.

    :param word_strings: the word strings handed to
                         ``create_word_variations``
    :return: the tuple ``(embedding_var, word_var_strings, word_var_dir,
             root_word_var, conf_words)``. ``embedding_var`` is the
             descriptor built from the keys of ``word_var_dir``;
             ``word_var_strings`` is the list of those keys; the remaining
             three values are returned exactly as produced by
             ``create_word_variations(word_strings, enable_conf=True)``.
    """
    # Iterate each inclusive character range on its own rather than
    # concatenating ``range`` results, which raises TypeError on Python 3.
    char_ranges = (('&', '&'), ('A', 'Z'), ('a', 'z'), ('0', '9'))
    unigrams = [
        chr(code)
        for first, last in char_ranges
        for code in range(ord(first), ord(last) + 1)
    ]

    word_var_dir, root_word_var, conf_words = create_word_variations(
        word_strings, enable_conf=True)

    # A real list, not a dictionary view, so the result stays indexable.
    word_var_strings = list(word_var_dir.keys())
    embedding_var = build_phoc_descriptor(words=word_var_strings,
                                          phoc_unigrams=unigrams,
                                          bigram_levels=None,
                                          phoc_bigrams=None,
                                          unigram_levels=(1, 2, 4, 8))

    print('embedding variations:', embedding_var.shape)
    return (embedding_var, word_var_strings, word_var_dir, root_word_var,
            conf_words)
def gen_text_phoc_embs(words):
    """Return the PHOC descriptor of the lower-cased ``words``.

    The unigram alphabet is ``a``-``z`` followed by ``0``-``9``; no bigrams
    are used and the unigram pyramid levels are ``(1, 2, 4, 8)``.

    :param words: iterable of strings; each one is lower-cased before
                  being embedded
    :return: whatever ``build_phoc_descriptor`` returns for the lower-cased
             words
    """
    lowered = [w.lower() for w in words]

    # Iterate each inclusive character range on its own rather than
    # concatenating ``range`` results, which raises TypeError on Python 3.
    unigrams = [
        chr(code)
        for first, last in (('a', 'z'), ('0', '9'))
        for code in range(ord(first), ord(last) + 1)
    ]

    return build_phoc_descriptor(words=lowered,
                                 phoc_unigrams=unigrams,
                                 bigram_levels=None,
                                 phoc_bigrams=None,
                                 unigram_levels=(1, 2, 4, 8))
Example #4
0
    def __init__(self,
                 gw_root_dir,
                 image_extension='.png',
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 use_bigrams=False,
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        Reads the ground truth file ``info.gtp`` below ``gw_root_dir``, loads
        every listed word image that exists and can be read, and builds the
        label encoder and the word string embeddings for the loaded words.
        The train/test ids are taken from ``IAM_words_indexes_sets.mat``.

        :param gw_root_dir: full path to the dataset root dir; it has to
                            contain ``info.gtp``, ``IAM_words_indexes_sets.mat``
                            and the ``words`` image directory
        :param image_extension: the extension of image files (default: .png)
        :param embedding: the word string embedding, one of phoc, spoc or
                          dctow (only phoc is implemented)
        :param phoc_unigram_levels: the unigram levels passed to
                                    ``build_phoc_descriptor``
        :param use_bigrams: if True, the bigrams returned by
                            ``get_most_common_n_grams`` are used at level 2
        :param fixed_image_size: stored unchanged as ``self.fixed_image_size``
        :param min_image_width_height: the minimum height or width a word image has to have
        :raises ValueError: if ``embedding`` is not a supported name
        :raises NotImplementedError: if ``embedding`` is spoc or dctow
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        self.path = gw_root_dir

        train_test_mat = scipy.io.loadmat(
            os.path.join(gw_root_dir, 'IAM_words_indexes_sets.mat'))

        gt_file = os.path.join(gw_root_dir, 'info.gtp')
        words = []
        # the context manager closes the ground truth file even if loading
        # one of the images fails unexpectedly
        with open(gt_file) as gt_handle:
            for line in gt_handle:
                if line.startswith("#"):
                    continue
                word_info = line.split()
                if not word_info:
                    # blank line, nothing to load
                    continue
                # the last two whitespace separated fields are the
                # transcription and the image name
                img_name = word_info[-1]
                transcr = word_info[-2]

                # images are stored as words/<a>/<a>-<b>/<a>-<b>-...<ext>
                img_paths = img_name.split('-')
                word_img_filename = img_paths[0] + '/' + \
                                    img_paths[0] + '-' + img_paths[1] + '/' + \
                                    img_name + image_extension

                word_img_filename = os.path.join(gw_root_dir, 'words',
                                                 word_img_filename)

                if not os.path.isfile(word_img_filename):
                    continue

                try:
                    word_img = img_io.imread(word_img_filename)
                except Exception:
                    # unreadable images are skipped on purpose; catching
                    # Exception (not everything) keeps Ctrl-C working
                    continue
                # scale black pixels to 1 and white pixels to 0
                word_img = 1 - word_img.astype(np.float32) / 255.0

                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                words.append((word_img, transcr.lower()))

        # every row of the stored index arrays contributes its first element
        self.train_ids = [x[0] for x in train_test_mat.get('idxTrain')]
        self.test_ids = [x[0] for x in train_test_mat.get('idxTest')]

        self.words = words

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([elem[1] for elem in words])

        # create embedding for the word_list
        self.word_embeddings = None
        word_strings = [elem[1] for elem in words]
        if embedding == 'phoc':
            # unigrams are a-z followed by 0-9; each range is iterated on
            # its own because range objects cannot be added on Python 3
            unigrams = [
                chr(code)
                for first, last in (('a', 'z'), ('0', '9'))
                for code in range(ord(first), ord(last) + 1)
            ]
            if use_bigrams:
                bigram_levels = [2]
                bigrams = get_most_common_n_grams(word_strings)
            else:
                bigram_levels = None
                bigrams = None

            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                bigram_levels=bigram_levels,
                phoc_bigrams=bigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)
Example #5
0
    def __init__(self, gw_root_dir, image_extension='.png',
                 cv_split_method=None, cv_split_idx=None,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        Loads every page image below ``<gw_root_dir>/pages``, cuts out the word
        images listed in the matching ``ground_truth/<page>.gtp`` file and
        builds the label encoder and the word string embeddings.

        :param gw_root_dir: full path to the GW root dir
        :param image_extension: the extension of image files (default: .png)
        :param cv_split_method: the CV method to be used for splitting the dataset
                                (None, almazan or fifepages; fifepages is not implemented)
        :param cv_split_idx: the index of the CV split to be used
                             can only be used if cv_split_method is not None
        :param embedding: the word string embedding, one of phoc, spoc or dctow
                          (only phoc is implemented)
        :param phoc_unigram_levels: the unigram levels passed to build_phoc_descriptor
        :param fixed_image_size: stored unchanged as self.fixed_image_size
        :param min_image_width_height: the minimum height or width a word image has to have
        :raises ValueError: if embedding or cv_split_method is not a supported value or
                            cv_split_idx is given without a cv_split_method
        :raises NotImplementedError: for the spoc and dctow embeddings and the fifepages split
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')
        if cv_split_method not in [None, 'almazan', 'fifepages']:
            raise ValueError('cv_split_method must be one of None, almazan or fifepages')
        if cv_split_idx is not None and cv_split_method is None:
            raise ValueError('if cv_split_idx is not None, you need to choose a cv_split_method')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        # load the dataset
        pages_dir = os.path.join(gw_root_dir, 'pages')
        img_filenames = sorted([elem for elem in os.listdir(pages_dir)
                                if elem.endswith(image_extension)])
        words = []
        for img_filename in img_filenames:
            # file name without its last extension, split at the dots
            stem_parts = img_filename.split('.')[:-1]
            page_id = '.'.join(stem_parts)
            doc_img = img_io.imread(os.path.join(gw_root_dir, 'pages', img_filename))
            # scale black pixels to 1 and white pixels to 0
            doc_img = 1 - doc_img.astype(np.float32) / 255.0
            annotation_filename = '.'.join(stem_parts + ['gtp'])
            annotation_lines = LineListIO.read_list(os.path.join(gw_root_dir,
                                                                 'ground_truth',
                                                                 annotation_filename))
            # each line is the annotation of a word image in the following format
            #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
            for line in annotation_lines:
                ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
                ul_x, ul_y, lr_x, lr_y = int(ul_x), int(ul_y), int(lr_x), int(lr_y)
                # copy the crop so the word image does not alias the page image
                word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
                word_img = check_size(img=word_img,
                                      min_image_width_height=min_image_width_height)
                words.append((word_img, transcr, page_id))

        self.words = words
        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([elem[1] for elem in words])

        # fixed unigram alphabet: a-z followed by 0-9; every range is iterated on its
        # own because range objects cannot be concatenated with + on Python 3
        unigrams = [chr(code)
                    for first, last in (('a', 'z'), ('0', '9'))
                    for code in range(ord(first), ord(last) + 1)]
        # create embedding for the word_list
        self.word_embeddings = None
        word_strings = [elem[1] for elem in words]
        if embedding == 'phoc':
            self.word_embeddings = build_phoc_descriptor(words=word_strings,
                                                         phoc_unigrams=unigrams,
                                                         unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)

        self.cv_split_method = cv_split_method
        self.cv_split_index = cv_split_idx

        if cv_split_method is not None:
            if cv_split_method == 'almazan':
                # CV splits as done in Almazan 2014
                self.split_ids = np.load(os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))

            else:
                # fifepages CV
                raise NotImplementedError()
Example #6
0
    def __init__(self,
                 gw_root_dir,
                 image_extension='.png',
                 cv_split_method=None,
                 cv_split_idx=None,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 fixed_image_size=None,
                 min_image_width_height=30):
        '''
        Constructor

        Loads every page image below ``<gw_root_dir>/pages``, cuts out the
        word images listed in the matching ``ground_truth/<page>.gtp`` file
        and builds the label encoder and the word string embeddings.

        :param gw_root_dir: full path to the GW root dir
        :param image_extension: the extension of image files (default: .png)
        :param cv_split_method: the CV method to be used for splitting the dataset
                                (None, almazan or fifepages; fifepages is
                                not implemented)
        :param cv_split_idx: the index of the CV split to be used
                             can only be used if cv_split_method is not None
        :param embedding: the word string embedding, one of phoc, spoc or
                          dctow (only phoc is implemented)
        :param phoc_unigram_levels: the unigram levels passed to
                                    build_phoc_descriptor
        :param fixed_image_size: stored unchanged as self.fixed_image_size
        :param min_image_width_height: the minimum height or width a word image has to have
        :raises ValueError: if embedding or cv_split_method is not a
                            supported value or cv_split_idx is given
                            without a cv_split_method
        :raises NotImplementedError: for the spoc and dctow embeddings and
                                     the fifepages split
        '''
        # sanity checks

        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')
        if cv_split_method not in [None, 'almazan', 'fifepages']:
            raise ValueError(
                'cv_split_method must be one of None, almazan or fifepages')
        if cv_split_idx is not None and cv_split_method is None:
            raise ValueError(
                'if cv_split_idx is not None, you need to choose a cv_split_method'
            )

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        # load the dataset
        img_filenames = sorted([
            elem for elem in os.listdir(os.path.join(gw_root_dir, 'pages'))
            if elem.endswith(image_extension)
        ])
        words = []
        for img_filename in img_filenames:
            # file name without its last extension, split at the dots
            stem_parts = img_filename.split('.')[:-1]
            page_id = '.'.join(stem_parts)
            doc_img = img_io.imread(
                os.path.join(gw_root_dir, 'pages', img_filename))
            # scale black pixels to 1 and white pixels to 0
            doc_img = 1 - doc_img.astype(np.float32) / 255.0
            annotation_filename = '.'.join(stem_parts + ['gtp'])
            annotation_lines = LineListIO.read_list(
                os.path.join(gw_root_dir, 'ground_truth', annotation_filename))
            # each line is the annotation of a word image in the following format
            #    <ul_x> <ul_y> <lr_x> <lr_y> <transcription>
            for line in annotation_lines:
                ul_x, ul_y, lr_x, lr_y, transcr = line.split(' ')
                ul_x, ul_y, lr_x, lr_y = int(ul_x), int(ul_y), int(lr_x), int(
                    lr_y)
                # copy the crop so the word image does not alias the page
                word_img = doc_img[ul_y:lr_y, ul_x:lr_x].copy()
                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                words.append((word_img, transcr, page_id))

        self.words = words
        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([elem[1] for elem in words])

        # fixed unigram alphabet: a-z followed by 0-9; every range is
        # iterated on its own because range objects cannot be concatenated
        # with + on Python 3
        unigrams = [
            chr(code)
            for first, last in (('a', 'z'), ('0', '9'))
            for code in range(ord(first), ord(last) + 1)
        ]
        # create embedding for the word_list
        self.word_embeddings = None
        word_strings = [elem[1] for elem in words]
        if embedding == 'phoc':
            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)

        self.cv_split_method = cv_split_method
        self.cv_split_index = cv_split_idx

        if cv_split_method is not None:
            if cv_split_method == 'almazan':
                # CV splits as done in Almazan 2014
                self.split_ids = np.load(
                    os.path.join(gw_root_dir, 'almazan_cv_indices.npy'))

            else:
                # fifepages CV
                raise NotImplementedError()
Example #7
0
	# Fragment of a larger evaluation routine (its header is outside this
	# excerpt). It builds PHOC descriptors for word_strings, matches them
	# against `outputs` and accumulates accuracy statistics.
	#
	# Choose the unigram alphabet: '&', A-Z, a-z, 0-9 when is_lower == 0,
	# otherwise only a-z and 0-9.
	# NOTE(review): `range(...) + range(...)` only works where range()
	# returns a list (Python 2); the print statements below are Python 2
	# syntax as well.
	if is_lower == 0:
    		unigrams = [chr(i) for i in range(ord('&'), ord('&')+1) + range(ord('A'), ord('Z')+1) + \
                    range(ord('a'), ord('z') + 1) + range(ord('0'), ord('9') + 1)]
	else:
    		unigrams = [chr(i) for i in range(ord('a'), ord('z') + 1) + range(ord('0'), ord('9') + 1)]


	# Lower-case the words in place (word_strings itself is modified) when
	# the lower-case alphabet is in use.
	if is_lower == 1:
    		for i in range(len(word_strings)):
        		word_strings[i] = word_strings[i].lower()
	else:
    		pass

	# bigram_levels, bigrams and phoc_unigram_levels are defined outside this
	# fragment.
	embedding = build_phoc_descriptor(words=word_strings,
                                  phoc_unigrams=unigrams,
                                  bigram_levels=bigram_levels,
                                  phoc_bigrams=bigrams,
                                  unigram_levels=phoc_unigram_levels)

	print embedding.shape

	# report_matches is called with the 'cosine' metric, k=1 and length=3.
	# Of its seven results only count and qualified_ids are used below.
	# NOTE(review): presumably count is the number of correct matches and
	# qualified_ids the evaluated samples - confirm in report_matches.
	count, close_count, matched_words, new_outputs, new_embedding, new_word_strings, \
        qualified_ids = report_matches(outputs, embedding, 'cosine', word_strings, \
                                       original_words, k=1, length=3, is_lower=is_lower)

	# Raises ZeroDivisionError when qualified_ids is empty.
	print "the accuracy is: "+str(count/float(len(qualified_ids)))
	# Running totals kept by the surrounding code.
	global_total += len(qualified_ids)
	global_correct += count
	#print "the close_count accuracy is: "+str(close_count/float(len(original_words)))
	
	avg_accuracy += count/float(len(qualified_ids))
Example #8
0
    def __init__(self,
                 map_root_dir1,
                 map_root_dir2,
                 all_files,
                 embedding='phoc',
                 phoc_unigram_levels=(1, 2, 4, 8),
                 use_bigrams=False,
                 fixed_image_size=None,
                 min_image_width_height=30,
                 is_lower=1):
        '''
        Constructor

        For every name in ``all_files`` two arrays are loaded with
        ``np.load``: the word images from ``map_root_dir1 + name`` and the
        transcriptions from ``map_root_dir2 + name``. The entries of the
        first file form the train partition, all remaining entries the test
        partition.

        :param map_root_dir1: path prefix of the word image arrays; it is
                              concatenated with the file name as is, so it
                              has to end with a path separator
        :param map_root_dir2: path prefix of the transcription arrays
                              (concatenated the same way)
        :param all_files: the file names to load; must not be empty
        :param embedding: the word string embedding, one of phoc, spoc or
                          dctow (only phoc is implemented)
        :param phoc_unigram_levels: the unigram levels passed to
                                    ``build_phoc_descriptor``
        :param use_bigrams: if True, the bigrams returned by
                            ``get_most_common_n_grams`` are used at level 2
        :param fixed_image_size: stored unchanged as ``self.fixed_image_size``
        :param min_image_width_height: the minimum height or width a word image has to have
        :param is_lower: if true, transcriptions are lower-cased and the
                         unigrams are a-z and 0-9; otherwise the
                         transcriptions are kept and the unigrams are
                         &, A-Z, a-z and 0-9
        :raises ValueError: if ``embedding`` is not a supported name
        :raises IndexError: if ``all_files`` is empty
        :raises NotImplementedError: if ``embedding`` is spoc or dctow
        '''
        # sanity checks
        if embedding not in ['phoc', 'spoc', 'dctow']:
            raise ValueError('embedding must be one of phoc, spoc or dctow')

        # class members
        self.word_list = None
        self.word_string_embeddings = None
        self.query_list = None
        self.label_encoder = None

        self.fixed_image_size = fixed_image_size

        words = []
        # number of word images contributed by each file
        lens = []
        for _file in all_files:
            images = np.load(map_root_dir1 + _file)
            transcriptions = np.load(map_root_dir2 + _file)

            for _id in range(len(images)):
                transcr = transcriptions[_id]
                # scale black pixels to 1 and white pixels to 0
                word_img = 1 - images[_id].astype(np.float32) / 255.0
                word_img = check_size(
                    img=word_img,
                    min_image_width_height=min_image_width_height)
                # move the last axis to the front; this needs a 3-D image
                word_img = np.transpose(word_img, (2, 0, 1))
                if is_lower:
                    transcr = transcr.lower()
                words.append((word_img, transcr))
            lens.append(len(images))

        # 0/1 masks over all words: the words of the first file are the
        # train partition, everything else is the test partition
        numTrain = lens[0]
        train_ids = np.zeros(len(words))
        train_ids[:numTrain] = 1
        self.train_ids = train_ids
        self.test_ids = 1 - train_ids

        self.words = words

        # compute a mapping from class string to class id
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit([elem[1] for elem in words])

        # create embedding for the word_list
        self.word_embeddings = None
        word_strings = [elem[1] for elem in words]
        if embedding == 'phoc':
            # inclusive character ranges of the unigram alphabet; each range
            # is iterated on its own because range objects cannot be
            # concatenated with + on Python 3
            if is_lower:
                char_ranges = (('a', 'z'), ('0', '9'))
            else:
                char_ranges = (('&', '&'), ('A', 'Z'), ('a', 'z'),
                               ('0', '9'))
            unigrams = [
                chr(code)
                for first, last in char_ranges
                for code in range(ord(first), ord(last) + 1)
            ]

            if use_bigrams:
                bigram_levels = [2]
                bigrams = get_most_common_n_grams(word_strings)
            else:
                bigram_levels = None
                bigrams = None

            self.word_embeddings = build_phoc_descriptor(
                words=word_strings,
                phoc_unigrams=unigrams,
                bigram_levels=bigram_levels,
                phoc_bigrams=bigrams,
                unigram_levels=phoc_unigram_levels)
        elif embedding == 'spoc':
            raise NotImplementedError()
        else:
            # dctow
            raise NotImplementedError()
        self.word_embeddings = self.word_embeddings.astype(np.float32)