Example #1
    def setupCTC(self):
        "create CTC loss and decoder and return them"
        # BxTxC -> TxBxC
        self.ctcIn3dTBC = tf.transpose(a=self.rnnOut3d, perm=[1, 0, 2])
        # ground truth text as sparse tensor
        self.gtTexts = tf.SparseTensor(
            tf.compat.v1.placeholder(tf.int64, shape=[None, 2]),
            tf.compat.v1.placeholder(tf.int32, [None]),
            tf.compat.v1.placeholder(tf.int64, [2]))

        # calc loss for batch
        self.seqLen = tf.compat.v1.placeholder(tf.int32, [None])
        self.loss = tf.reduce_mean(
            input_tensor=tf.compat.v1.nn.ctc_loss(labels=self.gtTexts,
                                                  inputs=self.ctcIn3dTBC,
                                                  sequence_length=self.seqLen,
                                                  ctc_merge_repeated=True))

        # calc loss for each element to compute label probability
        self.savedCtcInput = tf.compat.v1.placeholder(
            tf.float32, shape=[Model.maxTextLen, None,
                               len(self.charList) + 1])
        self.lossPerElement = tf.compat.v1.nn.ctc_loss(
            labels=self.gtTexts,
            inputs=self.savedCtcInput,
            sequence_length=self.seqLen,
            ctc_merge_repeated=True)

        # best path decoding or beam search decoding
        if self.decoderType == DecoderType.BestPath:
            self.decoder = tf.nn.ctc_greedy_decoder(
                inputs=self.ctcIn3dTBC, sequence_length=self.seqLen)
        elif self.decoderType == DecoderType.BeamSearch:
            self.decoder = tf.nn.ctc_beam_search_decoder(
                inputs=self.ctcIn3dTBC,
                sequence_length=self.seqLen,
                beam_width=50)
        # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch)
        elif self.decoderType == DecoderType.WordBeamSearch:
            # prepare information about language (dictionary, characters in dataset, characters forming words)
            chars = str().join(self.charList)
            wordChars = open(
                '../model/wordCharList.txt').read().splitlines()[0]
            corpus = open('../data/corpus.txt').read()

            # decode using the "Words" mode of word beam search
            from word_beam_search import WordBeamSearch
            self.decoder = WordBeamSearch(50, 'Words', 0.0,
                                          corpus.encode('utf8'),
                                          chars.encode('utf8'),
                                          wordChars.encode('utf8'))

            # the input to the decoder must have softmax already applied
            self.wbsInput = tf.nn.softmax(self.ctcIn3dTBC, axis=2)
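At inference time the word beam search decoder runs outside the TF graph: the softmaxed tensor wbsInput is evaluated with a session and the resulting numpy array is handed to compute(), as the full model's inferBatch method does in Example #5 further below. A minimal sketch of that call, assuming model is an instance of the class above, sess its session and feed a feed dict with inputImgs, seqLen and is_train (all hypothetical names here):

# evaluate the softmax output (TxBxC) and decode it outside the graph
wbs_mat = sess.run(model.wbsInput, feed)
label_strings = model.decoder.compute(wbs_mat)   # one list of label ids per batch element
texts = [''.join(model.charList[label] for label in labels) for labels in label_strings]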
Example #2
def testPyBind(feedMat, corpus, chars, wordChars):
    "decode using word beam search. Result is tuple, first entry is label string, second entry is char string."

    # decode using the "Words" mode of word beam search with beam width set to 25 and add-k smoothing to 0.0
    assert len(chars) + 1 == feedMat.shape[2]

    wbs = WordBeamSearch(25, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
    res = wbs.compute(feedMat)

    # the result is a sequence of labels; if it is shorter than T, it is terminated by the blank label (similar to C strings)
    blank = len(chars)
    s = ''
    for label in res[0]:
        if label == blank:
            break
        s += chars[label]  # map label to char
    return res[0], s
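A self-contained way to exercise testPyBind is to feed it a uniform dummy matrix; with no evidence in the input the decoded text is not meaningful, but the call shows the expected shapes and types. The toy corpus, chars and wordChars below are illustrative values, not the contents of the original corpus.txt/wordCharList.txt files:

import numpy as np
from word_beam_search import WordBeamSearch  # the listing above omits module-level imports

# toy language data (assumptions, only for illustration)
corpus = 'a ba ab abb'   # words the decoder may output
chars = 'ab '            # characters the classifier can emit (blank is added implicitly)
wordChars = 'ab'         # characters that may form words

# dummy softmax matrix of shape TxBxC: T=5 time steps, B=1 batch element, C=len(chars)+1
feedMat = np.ones((5, 1, len(chars) + 1), dtype=np.float32) / (len(chars) + 1)

labels, text = testPyBind(feedMat, corpus, chars, wordChars)
print(labels, repr(text))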
Example #3
    def __init__(self,
                 charList,
                 decoderType=DecoderType.BestPath,
                 mustRestore=False,
                 dump=False):
        "init model: add CNN, RNN and CTC and initialize TF"
        self.dump = dump
        self.charList = charList
        self.decoderType = decoderType
        self.mustRestore = mustRestore
        self.snapID = 0
        # create a WordBeamSearch decoder object
        chars = str().join(self.charList)
        wordChars = open('../model/wordCharList.txt').read().splitlines()[0]
        corpus = open('../data/corpus.txt').read()
        self.wbs = WordBeamSearch(25, 'Words', 0.0, corpus.encode('utf8'),
                                  chars.encode('utf8'),
                                  wordChars.encode('utf8'))

        # Whether to use normalization over a batch or a population
        self.is_train = tf.placeholder(tf.bool, name='is_train')

        # input image batch
        self.inputImgs = tf.placeholder(tf.float32,
                                        shape=(None, Model.imgSize[0],
                                               Model.imgSize[1]))

        # setup CNN, RNN and CTC
        self.setupCNN()
        self.setupRNN()
        self.setupCTC()

        # setup optimizer to train NN
        self.batchesTrained = 0
        self.learningRate = tf.placeholder(tf.float32, shape=[])
        self.update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.optimizer = tf.train.RMSPropOptimizer(
                self.learningRate).minimize(self.loss)

        # initialize TF
        (self.sess, self.saver) = self.setupTF()
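Unlike the Adam variant in Example #5, this constructor exposes learningRate as a placeholder, so every training step has to feed a value for it. A hedged sketch of one such step; the decay thresholds are illustrative, and model/batch are assumed to be an instance of this class and a batch object with imgs and gtTexts attributes:

# one training step with an explicitly fed learning rate (illustrative schedule)
rate = 0.01 if model.batchesTrained < 10 else (0.001 if model.batchesTrained < 10000 else 0.0001)
sparse = model.toSparse(batch.gtTexts)
_, lossVal = model.sess.run(
    [model.optimizer, model.loss],
    {model.inputImgs: batch.imgs,
     model.gtTexts: sparse,
     model.seqLen: [Model.maxTextLen] * len(batch.imgs),
     model.learningRate: rate,
     model.is_train: True})
model.batchesTrained += 1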
Example #4
def testPyBind(feedMat, corpus, chars, wordChars):
    "decode using word beam search. Result is tuple, first entry is label string, second entry is char string."

    # decode using the "Words" mode of word beam search with beam width set to 25 and add-k smoothing to 0.0
    assert len(chars) + 1 == feedMat.shape[2]

    print(feedMat.shape)
    
    start = time.perf_counter()
    wbs = WordBeamSearch(15, 'Words', 0.0, corpus.encode('utf8'), chars.encode('utf8'), wordChars.encode('utf8'))
    print('Create WBS:', time.perf_counter() - start)
    start = time.perf_counter()
    res = wbs.compute(feedMat)
    print('Compute 1:', time.perf_counter() - start)
    start = time.perf_counter()
    feedMat = np.concatenate((feedMat, feedMat, feedMat, feedMat, feedMat), axis=1)
    res = wbs.compute(feedMat)
    print('Compute 2:', time.perf_counter() - start)
    start = time.perf_counter()
    feedMat = np.concatenate((feedMat, feedMat), axis=1)
    res = wbs.compute(feedMat)
    print('Compute 3:', time.perf_counter() - start)
    start = time.perf_counter()
    feedMat = np.concatenate((feedMat, feedMat), axis=1)
    res = wbs.compute(feedMat)
    print('Compute 4:', time.perf_counter() - start)
    start = time.perf_counter()
    feedMat = np.concatenate((feedMat, feedMat), axis=1)
    res = wbs.compute(feedMat)
    print('Compute 5:', time.perf_counter() - start)
    start = time.perf_counter()
    feedMat = np.concatenate((feedMat, feedMat), axis=1)
    res = wbs.compute(feedMat)
    print('Compute 6:', time.perf_counter() - start)

    # the result is a sequence of labels; if it is shorter than T, it is terminated by the blank label (similar to C strings)
    blank = len(chars)
    s = ''
    for label in res[0]:
        if label == blank:
            break
        s += chars[label]  # map label to char
    return res[0], s
Example #5
class Model:
    "minimalistic TF model for HTR"

    # model constants
    imgSize = (128, 32)
    maxTextLen = 32

    def __init__(self,
                 charList,
                 decoderType=DecoderType.BestPath,
                 mustRestore=False,
                 dump=False):
        "init model: add CNN, RNN and CTC and initialize TF"
        self.dump = dump
        self.charList = charList
        self.decoderType = decoderType
        self.mustRestore = mustRestore
        self.snapID = 0

        # Whether to use normalization over a batch or a population
        self.is_train = tf.compat.v1.placeholder(tf.bool, name='is_train')

        # input image batch
        self.inputImgs = tf.compat.v1.placeholder(tf.float32,
                                                  shape=(None,
                                                         Model.imgSize[0],
                                                         Model.imgSize[1]))

        # setup CNN, RNN and CTC
        self.setupCNN()
        self.setupRNN()
        self.setupCTC()

        # setup optimizer to train NN
        self.batchesTrained = 0
        self.update_ops = tf.compat.v1.get_collection(
            tf.compat.v1.GraphKeys.UPDATE_OPS)
        with tf.control_dependencies(self.update_ops):
            self.optimizer = tf.compat.v1.train.AdamOptimizer().minimize(
                self.loss)

        # initialize TF
        (self.sess, self.saver) = self.setupTF()

    def setupCNN(self):
        "create CNN layers and return output of these layers"
        cnnIn4d = tf.expand_dims(input=self.inputImgs, axis=3)

        # list of parameters for the layers
        kernelVals = [5, 5, 3, 3, 3]
        featureVals = [1, 32, 64, 128, 128, 256]
        strideVals = poolVals = [(2, 2), (2, 2), (1, 2), (1, 2), (1, 2)]
        numLayers = len(strideVals)

        # create layers
        pool = cnnIn4d  # input to first CNN layer
        for i in range(numLayers):
            kernel = tf.Variable(
                tf.random.truncated_normal([
                    kernelVals[i], kernelVals[i], featureVals[i],
                    featureVals[i + 1]
                ],
                                           stddev=0.1))
            conv = tf.nn.conv2d(input=pool,
                                filters=kernel,
                                padding='SAME',
                                strides=(1, 1, 1, 1))
            conv_norm = tf.compat.v1.layers.batch_normalization(
                conv, training=self.is_train)
            relu = tf.nn.relu(conv_norm)
            pool = tf.nn.max_pool2d(input=relu,
                                    ksize=(1, poolVals[i][0], poolVals[i][1],
                                           1),
                                    strides=(1, strideVals[i][0],
                                             strideVals[i][1], 1),
                                    padding='VALID')

        self.cnnOut4d = pool
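    # Shape walk-through for Model.imgSize = (128, 32), derived from the kernel/pool lists
    # above (an editorial note, not printed by the code): the 'SAME' convolutions keep the
    # spatial size, so only the pooling steps shrink it. The first spatial axis (128) is
    # halved twice and then kept: 128 -> 64 -> 32; the second axis (32) is halved five
    # times: 32 -> 1. Feature maps grow 1 -> 32 -> 64 -> 128 -> 128 -> 256, so cnnOut4d has
    # shape (B, 32, 1, 256); squeezing axis 2 in setupRNN yields the (B, T=32, F=256)
    # sequence that the RNN and the CTC layer operate on (T equals Model.maxTextLen).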

    def setupRNN(self):
        "create RNN layers and return output of these layers"
        rnnIn3d = tf.squeeze(self.cnnOut4d, axis=[2])

        # basic cells which are used to build the RNN
        numHidden = 256
        cells = [
            tf.compat.v1.nn.rnn_cell.LSTMCell(num_units=numHidden,
                                              state_is_tuple=True)
            for _ in range(2)
        ]  # 2 layers

        # stack basic cells
        stacked = tf.compat.v1.nn.rnn_cell.MultiRNNCell(cells,
                                                        state_is_tuple=True)

        # bidirectional RNN
        # BxTxF -> BxTx2H
        ((fw, bw),
         _) = tf.compat.v1.nn.bidirectional_dynamic_rnn(cell_fw=stacked,
                                                        cell_bw=stacked,
                                                        inputs=rnnIn3d,
                                                        dtype=rnnIn3d.dtype)

        # BxTxH + BxTxH -> BxTx2H -> BxTx1X2H
        concat = tf.expand_dims(tf.concat([fw, bw], 2), 2)

        # project output to chars (including blank): BxTx1x2H -> BxTx1xC -> BxTxC
        kernel = tf.Variable(
            tf.random.truncated_normal(
                [1, 1, numHidden * 2,
                 len(self.charList) + 1], stddev=0.1))
        self.rnnOut3d = tf.squeeze(tf.nn.atrous_conv2d(value=concat,
                                                       filters=kernel,
                                                       rate=1,
                                                       padding='SAME'),
                                   axis=[2])

    def setupCTC(self):
        "create CTC loss and decoder and return them"
        # BxTxC -> TxBxC
        self.ctcIn3dTBC = tf.transpose(a=self.rnnOut3d, perm=[1, 0, 2])
        # ground truth text as sparse tensor
        self.gtTexts = tf.SparseTensor(
            tf.compat.v1.placeholder(tf.int64, shape=[None, 2]),
            tf.compat.v1.placeholder(tf.int32, [None]),
            tf.compat.v1.placeholder(tf.int64, [2]))

        # calc loss for batch
        self.seqLen = tf.compat.v1.placeholder(tf.int32, [None])
        self.loss = tf.reduce_mean(
            input_tensor=tf.compat.v1.nn.ctc_loss(labels=self.gtTexts,
                                                  inputs=self.ctcIn3dTBC,
                                                  sequence_length=self.seqLen,
                                                  ctc_merge_repeated=True))

        # calc loss for each element to compute label probability
        self.savedCtcInput = tf.compat.v1.placeholder(
            tf.float32, shape=[Model.maxTextLen, None,
                               len(self.charList) + 1])
        self.lossPerElement = tf.compat.v1.nn.ctc_loss(
            labels=self.gtTexts,
            inputs=self.savedCtcInput,
            sequence_length=self.seqLen,
            ctc_merge_repeated=True)

        # best path decoding or beam search decoding
        if self.decoderType == DecoderType.BestPath:
            self.decoder = tf.nn.ctc_greedy_decoder(
                inputs=self.ctcIn3dTBC, sequence_length=self.seqLen)
        elif self.decoderType == DecoderType.BeamSearch:
            self.decoder = tf.nn.ctc_beam_search_decoder(
                inputs=self.ctcIn3dTBC,
                sequence_length=self.seqLen,
                beam_width=50)
        # word beam search decoding (see https://github.com/githubharald/CTCWordBeamSearch)
        elif self.decoderType == DecoderType.WordBeamSearch:
            # prepare information about language (dictionary, characters in dataset, characters forming words)
            chars = str().join(self.charList)
            wordChars = open(
                '../model/wordCharList.txt').read().splitlines()[0]
            corpus = open('../data/corpus.txt').read()

            # decode using the "Words" mode of word beam search
            from word_beam_search import WordBeamSearch
            self.decoder = WordBeamSearch(50, 'Words', 0.0,
                                          corpus.encode('utf8'),
                                          chars.encode('utf8'),
                                          wordChars.encode('utf8'))

            # the input to the decoder must have softmax already applied
            self.wbsInput = tf.nn.softmax(self.ctcIn3dTBC, axis=2)

    def setupTF(self):
        "initialize TF"
        print('Python: ' + sys.version)
        print('Tensorflow: ' + tf.__version__)

        sess = tf.compat.v1.Session()  # TF session

        saver = tf.compat.v1.train.Saver(
            max_to_keep=1)  # saver saves model to file
        modelDir = '../model/'
        latestSnapshot = tf.train.latest_checkpoint(
            modelDir)  # is there a saved model?

        # if model must be restored (for inference), there must be a snapshot
        if self.mustRestore and not latestSnapshot:
            raise Exception('No saved model found in: ' + modelDir)

        # load saved model if available
        if latestSnapshot:
            print('Init with stored values from ' + latestSnapshot)
            saver.restore(sess, latestSnapshot)
        else:
            print('Init with new values')
            sess.run(tf.compat.v1.global_variables_initializer())

        return (sess, saver)

    def toSparse(self, texts):
        "put ground truth texts into sparse tensor for ctc_loss"
        indices = []
        values = []
        shape = [len(texts), 0]  # last entry must be the length of the longest label string

        # go over all texts
        for (batchElement, text) in enumerate(texts):
            # convert text to a list of labels (i.e. class ids)
            labelStr = [self.charList.index(c) for c in text]
            # sparse tensor must have size of max. label-string
            if len(labelStr) > shape[1]:
                shape[1] = len(labelStr)
            # put each label into sparse tensor
            for (i, label) in enumerate(labelStr):
                indices.append([batchElement, i])
                values.append(label)

        return (indices, values, shape)
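    # Worked example (hypothetical values): with charList = ['a', 'b', 'c'],
    # toSparse(['ab', 'c']) returns
    #   indices = [[0, 0], [0, 1], [1, 0]]   (batch element, position in label string)
    #   values  = [0, 1, 2]                  (label ids, i.e. indices into charList)
    #   shape   = [2, 2]                     ([batch size, length of longest label string])
    # and this triple is fed to the SparseTensor placeholders created in setupCTC.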

    def decoderOutputToText(self, ctcOutput, batchSize):
        "extract texts from output of CTC decoder"

        # word beam search: already contains label strings
        if self.decoderType == DecoderType.WordBeamSearch:
            labelStrs = ctcOutput

        # TF decoders: label strings are contained in sparse tensor
        else:
            # ctc returns tuple, first element is SparseTensor
            decoded = ctcOutput[0][0]

            # contains string of labels for each batch element
            labelStrs = [[] for _ in range(batchSize)]

            # go over all indices and save mapping: batch -> values
            for (idx, idx2d) in enumerate(decoded.indices):
                label = decoded.values[idx]
                batchElement = idx2d[0]  # index according to [b,t]
                labelStrs[batchElement].append(label)

        # map labels to chars for all batch elements
        return [
            str().join([self.charList[c] for c in labelStr])
            for labelStr in labelStrs
        ]

    def trainBatch(self, batch):
        "feed a batch into the NN to train it"
        numBatchElements = len(batch.imgs)
        sparse = self.toSparse(batch.gtTexts)
        evalList = [self.optimizer, self.loss]
        feedDict = {
            self.inputImgs: batch.imgs,
            self.gtTexts: sparse,
            self.seqLen: [Model.maxTextLen] * numBatchElements,
            self.is_train: True
        }
        _, lossVal = self.sess.run(evalList, feedDict)
        self.batchesTrained += 1
        return lossVal
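    # Usage sketch (hedged): a training loop around trainBatch. The loader object and its
    # trainSet()/hasNext()/getNext() methods are assumptions modelled on the data loader the
    # surrounding project uses, not part of this class:
    #   for epoch in range(1, 26):
    #       loader.trainSet()                 # rewind/shuffle the training split
    #       while loader.hasNext():
    #           loss = model.trainBatch(loader.getNext())
    #       model.save()                      # snapshot after each epoch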

    def dumpNNOutput(self, rnnOutput):
        "dump the output of the NN to CSV file(s)"
        dumpDir = '../dump/'
        if not os.path.isdir(dumpDir):
            os.mkdir(dumpDir)

        # iterate over all batch elements and create a CSV file for each one
        maxT, maxB, maxC = rnnOutput.shape
        for b in range(maxB):
            csv = ''
            for t in range(maxT):
                for c in range(maxC):
                    csv += str(rnnOutput[t, b, c]) + ';'
                csv += '\n'
            fn = dumpDir + 'rnnOutput_' + str(b) + '.csv'
            print('Write dump of NN to file: ' + fn)
            with open(fn, 'w') as f:
                f.write(csv)

    def inferBatch(self, batch, calcProbability=False, probabilityOfGT=False):
        "feed a batch into the NN to recognize the texts"

        # decode, optionally save RNN output
        numBatchElements = len(batch.imgs)

        # put tensors to be evaluated into list
        evalList = []

        if self.decoderType == DecoderType.WordBeamSearch:
            evalList.append(self.wbsInput)
        else:
            evalList.append(self.decoder)

        if self.dump or calcProbability:
            evalList.append(self.ctcIn3dTBC)

        # dict containing all tensors fed into the model
        feedDict = {
            self.inputImgs: batch.imgs,
            self.seqLen: [Model.maxTextLen] * numBatchElements,
            self.is_train: False
        }

        # evaluate model
        evalRes = self.sess.run(evalList, feedDict)

        # TF decoders: decoding already done in TF graph
        if self.decoderType != DecoderType.WordBeamSearch:
            decoded = evalRes[0]
        # word beam search decoder: decoding is done in C++ function compute()
        else:
            decoded = self.decoder.compute(evalRes[0])

        # map labels (numbers) to character string
        texts = self.decoderOutputToText(decoded, numBatchElements)
        # feed RNN output and recognized text into CTC loss to compute labeling probability
        probs = None
        if calcProbability:
            sparse = self.toSparse(
                batch.gtTexts) if probabilityOfGT else self.toSparse(texts)
            ctcInput = evalRes[1]
            evalList = self.lossPerElement
            feedDict = {
                self.savedCtcInput: ctcInput,
                self.gtTexts: sparse,
                self.seqLen: [Model.maxTextLen] * numBatchElements,
                self.is_train: False
            }
            lossVals = self.sess.run(evalList, feedDict)
            probs = np.exp(-lossVals)

        # dump the output of the NN to CSV file(s)
        if self.dump:
            self.dumpNNOutput(evalRes[1])

        return texts, probs

    def save(self):
        "save model to file"
        self.snapID += 1
        self.saver.save(self.sess,
                        '../model/snapshot',
                        global_step=self.snapID)
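To close the listing, a hedged end-to-end sketch of how this class could be used for inference. The character-list path, the Batch namedtuple and the random stand-in image are assumptions (a trained snapshot is expected under ../model/, the character list is read from a file, and images are preprocessed grayscale arrays of size Model.imgSize); with a random array the recognized text is of course meaningless:

import collections
import numpy as np

Batch = collections.namedtuple('Batch', 'gtTexts imgs')    # assumption: mirrors the project's Batch class

charList = list(open('../model/charList.txt').read())      # assumption: file listing all characters
model = Model(charList, decoderType=DecoderType.BestPath, mustRestore=True)

img = np.random.random(Model.imgSize).astype(np.float32)   # stand-in for a preprocessed word image
texts, _ = model.inferBatch(Batch(None, [img]))
print('Recognized:', texts[0])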