Example #1
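    # Assumed imports for this snippet (not shown in the original source):
    #   import tensorflow as tf
    #   from tensorflow.data import Dataset
    #   from tensorflow.io.gfile import GFile
    #   from tensorflow.keras.layers import StringLookup
    #   from tensorflow.strings import unicode_split
    #   import constants as C  # hypothetical module with SEQUENCE_LENGTH, BATCH_SIZE, BUFFER_SIZE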
    def load_data(self):
        data = GFile(self.file_path, 'rb').read().decode(encoding='UTF-8')

        # Get a list of the unique characters in the text
        vocab = list(sorted(set(data)))
        vocab_size = len(vocab)

        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences (+1 since the targets are shifted by one)
        sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
        sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

        # Batch the sequences
        ds = sequences_ds.padded_batch(C.BATCH_SIZE)
        ds = ds.map(self._to_inputs_and_targets,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds = ds.shuffle(C.BUFFER_SIZE)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

        return ds
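The `_to_inputs_and_targets` helper is not shown in this snippet; only its name is known. A minimal sketch consistent with the comment above (targets are the inputs shifted by one character) could look like this:

    def _to_inputs_and_targets(self, sequence):
        # Each batched sequence holds SEQUENCE_LENGTH + 1 ids: the input is
        # everything but the last id, the target everything but the first,
        # so the model learns to predict the next character at every position.
        return sequence[..., :-1], sequence[..., 1:]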
Example #2
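    # Assumed imports for this snippet (not shown in the original source):
    #   from tensorflow.data import Dataset
    #   from tensorflow.io.gfile import GFile
    #   from tensorflow.keras.layers import StringLookup
    #   from tensorflow.strings import unicode_split
    #   import constants as C  # hypothetical module with SEQUENCE_LENGTH and BATCH_SIZE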
    def get_svg_ds(self):
        data = GFile('datasets/svgs/simpleline.svg',
                     'rb').read().decode(encoding='UTF-8')

        # Hardcode the vocabulary: the unique characters of this particular file
        vocab = ['e', 'g', 'n', 'r', '\n']
        vocab_size = len(vocab)

        # Build the char-to-id and id-to-char lookup tables
        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences
        svg_ds = Dataset.from_tensor_slices(ids_of_chars)
        svg_ds = svg_ds.batch(C.SEQUENCE_LENGTH)
        svg_ds = svg_ds.batch(C.BATCH_SIZE)

        return svg_ds
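As a quick sanity check of the two lookup layers (a hypothetical snippet, not part of the original source):

    chars_to_ids = StringLookup(vocabulary=['e', 'g', 'n', 'r', '\n'])
    ids_to_chars = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    ids = chars_to_ids(['n', 'e', 'g'])   # integer ids; index 0 is reserved for [UNK]
    chars = ids_to_chars(ids)             # round-trips back to [b'n', b'e', b'g']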
Example #3
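# Assumed imports for this snippet (not shown in the original source):
#   import numpy as np
#   from PIL import Image
#   from tensorflow.keras.backend import ctc_decode
#   from tensorflow.keras.layers import (Bidirectional, Dense, Dropout, Input,
#                                        LSTM, Reshape, StringLookup)
#   from tensorflow.keras.models import Model
#   from tensorflow.strings import reduce_join
#   ArtsInfo, Config, and MobileNetV3_Small are project-specific modules.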
class OCR:
    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        self.scale_ratio = scale_ratio
        # Collect every character that can appear in artifact text: names,
        # types, attribute labels, and the numeric/punctuation characters.
        self.characters = sorted([
            *set("".join(
                sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
                list(ArtsInfo.MainAttrNames.values()) +
                list(ArtsInfo.SubAttrNames.values()) + list(".,+%0123456789")))
        ])
        # Mapping characters to integers
        self.char_to_num = StringLookup(vocabulary=list(self.characters),
                                        num_oov_indices=0,
                                        mask_token="")

        # Mapping integers back to original characters
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(),
            oov_token="",
            mask_token="",
            invert=True)

        self.width = 240
        self.height = 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)

    def detect_info(self, art_img):
        info = self.extract_art_info(art_img)
        # Stack the preprocessed crops into one batch of shape (N, width, height, 1)
        x = np.concatenate([
            self.preprocess(info[key]).T[None, :, :, None]
            for key in sorted(info.keys())
        ], axis=0)
        y = self.model.predict(x)
        y = self.decode(y)
        return {
            **{key: v for key, v in zip(sorted(info.keys()), y)},
            'star': self.detect_star(art_img),
        }

    def extract_art_info(self, art_img):
        name = art_img.crop([i * self.scale_ratio for i in Config.name_coords])
        type = art_img.crop([i * self.scale_ratio for i in Config.type_coords])
        main_attr_name = art_img.crop(
            [i * self.scale_ratio for i in Config.main_attr_name_coords])
        main_attr_value = art_img.crop(
            [i * self.scale_ratio for i in Config.main_attr_value_coords])
        level = art_img.crop(
            [i * self.scale_ratio for i in Config.level_coords])
        subattr_1 = art_img.crop([
            i * self.scale_ratio for i in Config.subattr_1_coords
        ])  # [73, 83, 102]
        subattr_2 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_2_coords])
        subattr_3 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_3_coords])
        subattr_4 = art_img.crop(
            [i * self.scale_ratio for i in Config.subattr_4_coords])
        # [73, 83, 102] is the RGB color of the sub-attribute text; if no pixel
        # in a crop is close to it, that sub-attribute slot is empty and is
        # dropped from the result. Note: np.float was removed in NumPy 1.24,
        # so the builtin float is used instead.
        if np.all(
                np.abs(np.array(subattr_1, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_1
            del subattr_2
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_2, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_2
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_3, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_3
            del subattr_4
        elif np.all(
                np.abs(np.array(subattr_4, float) -
                       [[[73, 83, 102]]]).max(axis=-1) > 25):
            del subattr_4
        # Collect the surviving crops by variable name via locals(); deleted
        # sub-attribute entries are automatically excluded.
        return {
            key: value
            for key, value in locals().items()
            if key not in ['art_img', 'self']
        }

    def detect_star(self, art_img):
        star = art_img.crop([i * self.scale_ratio for i in Config.star_coords])
        cropped_star = self.crop(self.normalize(self.to_gray(star)))
        # The width/height ratio of the cropped star row grows roughly linearly
        # with the number of stars; the constants below are an empirical linear
        # fit mapping that ratio to an integer star count.
        coef = cropped_star.shape[1] / cropped_star.shape[0]
        coef = coef / 1.30882352 + 0.21568627
        return int(round(coef))

    def to_gray(self, text_img):
        text_img = np.array(text_img)
        if len(text_img.shape) > 2:
            # Convert RGB to luminance using the ITU-R BT.601 weights.
            text_img = (
                text_img[..., :3] @ [[[0.299], [0.587], [0.114]]])[:, :, 0]
        return np.array(text_img, np.float32)

    def normalize(self, img, auto_inverse=True):
        img -= img.min()
        img /= img.max()
        # If the bottom-right pixel (assumed to be background) is bright,
        # invert so that text is bright on a dark background.
        if auto_inverse and img[-1, -1] > 0.5:
            img = 1 - img
        return img

    def crop(self, img, tol=0.7):
        # Trim rows and columns whose pixels are all below the tolerance `tol`,
        # keeping only the bounding box of the bright (text) region of the
        # 2D image `img`.
        mask = img > tol
        m, n = img.shape
        mask0, mask1 = mask.any(0), mask.any(1)
        col_start, col_end = mask0.argmax(), n - mask0[::-1].argmax()
        row_start, row_end = mask1.argmax(), m - mask1[::-1].argmax()
        return img[row_start:row_end, col_start:col_end]

    def resize_to_height(self, img):
        height = self.height
        return (np.array(
            Image.fromarray(np.uint8(img * 255)).resize(
                (int(img.shape[1] * height / img.shape[0]), height),
                Image.BILINEAR,
            )) / 255)

    def pad_to_width(self, img):
        width = self.width
        if img.shape[1] >= width:
            return img[:, :width]
        return np.pad(img, [[0, 0], [0, width - img.shape[1]]],
                      mode="constant",
                      constant_values=0)

    def preprocess(self, text_img):
        result = self.to_gray(text_img)
        result = self.normalize(result, True)
        result = self.crop(result)
        result = self.normalize(result, False)
        result = self.resize_to_height(result)
        result = self.pad_to_width(result)
        return result

    def decode(self, pred):
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
        results = ctc_decode(pred, input_length=input_len,
                             greedy=True)[0][0][:, :self.max_length]
        # Iterate over the results and get back the text
        output_text = []
        for res in results:
            res = self.num_to_char(res)
            res = reduce_join(res)
            res = res.numpy().decode("utf-8")
            output_text.append(res)
        return output_text

    def build_model(self, input_shape):
        input_img = Input(shape=(input_shape[0], input_shape[1], 1),
                          name="image",
                          dtype="float32")
        mobilenet = MobileNetV3_Small((input_shape[0], input_shape[1], 1),
                                      0,
                                      alpha=1.0,
                                      include_top=False).build()
        x = mobilenet(input_img)
        new_shape = ((input_shape[0] // 8), (input_shape[1] // 8) * 576)
        x = Reshape(target_shape=new_shape, name="reshape")(x)
        x = Dense(64, activation="relu", name="dense1")(x)
        x = Dropout(0.2)(x)

        # RNNs
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)

        # Output layer; the +2 leaves room for extra classes beyond the
        # character set (e.g. the CTC blank token)
        output = Dense(len(self.characters) + 2,
                       activation="softmax",
                       name="dense2")(x)

        # Define the model
        self.model = Model(inputs=[input_img],
                           outputs=output,
                           name="ocr_model_v1")
"""
Keras provides different preprocessing layers to deal with different modalities of
data. [This guide](https://keras.io/guides/preprocessing_layers/) provides a
comprehensive introduction. Our example involves preprocessing labels at the
character level. This means that if there are two labels, e.g. "cat" and "dog",
then our character vocabulary should be {a, c, d, g, o, t} (without any special
tokens). We use the
[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/)
layer for this purpose.
"""

AUTOTUNE = tf.data.AUTOTUNE

# Mapping characters to integers. `characters` is the set of unique characters
# in the labels, computed earlier in the tutorial.
char_to_num = StringLookup(vocabulary=list(characters), mask_token=None)

# Mapping integers back to original characters.
num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(),
                           mask_token=None,
                           invert=True)
"""
### Resizing images without distortion

Instead of square images, many OCR models work with rectangular images. This will
become clearer in a moment when we visualize a few samples from the dataset. While
aspect-unaware resizing of square images does not introduce a significant amount of
distortion, this is not the case for rectangular images. But resizing images to a
uniform size is a requirement for mini-batching. So we need to perform our resizing
such that the following criteria are met:

* Aspect ratio is preserved.
* Content of the images is not affected.
"""