Example #1
    def __init__(self, config, source):
        self.config = config
        self.source = source
        self.secondSource = source.clone()
        self.secondSource.shuffleDocuments()
        self.random = numpy.random.RandomState(seed=self.getSeed())
        self.vocab = Vocab(config)
    def getOrLoadModel(self):
        import os

        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(
            self.checkpointer.getModelDirectory()) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.load()
Example #3
    def getOrLoadModel(self):
        """Returns a linear model.

        If specified, create a new model else load an already existing model.
        """
        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(self.checkpointer.getModelDirectory(
        )) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.loadModel()
Example #4
    def maskOffTokens(self, labels):
        inputs = list(labels)

        for i in range(1, len(labels)):
            if self.random.binomial(1, 0.15):
                if self.random.binomial(1, 0.8):
                    inputs[i] = Vocab.getMaskToken()
                else:
                    if self.random.binomial(1, 0.5):
                        inputs[i] = self.random.randint(Vocab.getVocabOffset(),
                            self.vocab.getSize())

        inputs[0] = Vocab.getClassLabelToken()

        return inputs
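
The nested binomial draws above implement a BERT-style corruption scheme: about 15% of positions are selected, and of those roughly 80% become the mask token, 10% a random vocabulary token, and 10% stay unchanged. A minimal, self-contained simulation of the same draw structure (independent of Vocab, just to show the resulting rates):

import numpy

random = numpy.random.RandomState(0)
counts = {"mask": 0, "random": 0, "keep": 0, "untouched": 0}

for _ in range(100000):
    if random.binomial(1, 0.15):
        if random.binomial(1, 0.8):
            counts["mask"] += 1
        elif random.binomial(1, 0.5):
            counts["random"] += 1
        else:
            counts["keep"] += 1
    else:
        counts["untouched"] += 1

print(counts)  # roughly 12% mask, 1.5% random, 1.5% keep, 85% untouched
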
class PerTokenEvaluator:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def initialize(self):
        self.entropy = 0.0
        self.totalBytes = 0

    def evaluate(self, inputs, labels, predictions):
        import math

        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        for batch in range(batchSize):
            for token in range(sequenceLength):
                # probability the model assigned to the gold label at this position
                p = predictions[batch, token, 0]
                tokenBytes = self.vocab.getTokenBytes(labels[batch, token])

                # accumulate bits; finalize() normalizes by the total byte count
                self.entropy += -math.log2(p)
                self.totalBytes += tokenBytes

    def getRequestedPredictions(self, inputs, labels):
        import numpy
        return numpy.expand_dims(labels, axis=2)

    def finalize(self):
        # per-byte perplexity: 2 ** (total bits / total bytes)
        return 2 ** (self.entropy / self.totalBytes)
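
If the evaluator is meant to report per-byte perplexity, as the 2 ** (...) in finalize() suggests, the arithmetic reduces to "total bits assigned to the gold tokens divided by total bytes". A tiny standalone check with made-up probabilities and byte lengths:

import numpy

probabilities = numpy.array([0.5, 0.25, 0.125])   # model probability of each gold token
tokenBytes = numpy.array([3, 2, 3])               # UTF-8 byte length of each gold token

bits = -numpy.log2(probabilities)                 # 1 + 2 + 3 = 6 bits in total
perplexityPerByte = 2 ** (bits.sum() / tokenBytes.sum())

print(perplexityPerByte)                          # 2 ** (6 / 8), roughly 1.68
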
Example #6
def saveVocab(dataset, size, directory):
    import os
    import time

    vocab = createInitialVocab()

    # write vocab.txt inside an existing directory, otherwise treat the
    # argument as a file path
    if os.path.isdir(directory):
        outputPath = os.path.join(directory, "vocab.txt")
    else:
        outputPath = directory

    previousVocabSize = 0

    start = time.time()
    totalTokens = 0

    while True:
        string = dataset.next()
        if len(string) == 0:
            break
        if not string in vocab:
            vocab[string] = 0

        totalTokens += 1
        vocab[string] += 1

        if len(vocab) + Vocab.getVocabOffset(
        ) >= previousVocabSize + size * 0.01:
            previousVocabSize = len(vocab) + Vocab.getVocabOffset()
            logger.debug("Vocab size is " + str(previousVocabSize) +
                         " time so far: " + str(time.time() - start) +
                         " total tokens: " + str(totalTokens))

        if len(vocab) + Vocab.getVocabOffset() >= size:
            break

    with open(outputPath, "w", encoding='utf-8') as outputFile:
        for token, count in reversed(sorted(vocab.items(),
                                            key=lambda x: x[1])):
            if token[-1] != '\n':
                token += '\n'
            outputFile.write(token)
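
saveVocab streams tokens, counts occurrences until the target size (plus the reserved offset) is reached, and writes the tokens one per line in descending frequency. A compact, self-contained restatement of the counting-and-sorting idea, using collections.Counter instead of the example's explicit dict and Vocab/logger dependencies:

from collections import Counter

def buildFrequencyVocab(tokens, size):
    counts = Counter()
    for token in tokens:
        counts[token] += 1
        if len(counts) >= size:
            break
    # most frequent tokens first, one per line as in vocab.txt
    return [token for token, _ in counts.most_common()]

print(buildFrequencyVocab("the cat sat on the mat the end".split(), size=6))
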
Example #7
    def next(self):
        tokenString = self.source.next()

        if self.vocab.contains(tokenString):
            token = self.vocab.getToken(tokenString)
        else:
            token = Vocab.getUnkToken()

        return token
class RandomModel:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def train(self):
        # no training happens in this model
        pass

    def predict(self, inputs):
        # output is [batch-size, sequence-length, vocab-size] of 1.0/vocab-size
        batchSize = inputs.shape[0]
        sequenceLength = inputs.shape[1]
        vocabSize = self.getVocabSize()

        return numpy.full([batchSize, sequenceLength, vocabSize],
                          1.0 / vocabSize)

    def getVocabSize(self):
        return self.vocab.getSize()

    def getVocab(self):
        return self.vocab
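
Because RandomModel assigns 1 / vocabSize to every token, its per-token perplexity is exactly the vocabulary size, which makes it a convenient sanity-check baseline. A quick numeric check of that identity:

import numpy

vocabSize = 100
predictions = numpy.full([4, 16, vocabSize], 1.0 / vocabSize)

bitsPerToken = -numpy.log2(predictions[:, :, 0])
perplexity = 2 ** bitsPerToken.mean()

assert abs(perplexity - vocabSize) < 1e-6
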
Example #9
class ClassTransformerModel:
    def __init__(self, config, trainingDataSource, validationDataSource):
        """Initializes the model.

        Attributes:
            config: The configuration for the model.
            trainingDataSource: list of training samples and labels
            validationDataSource: list of validation samples and labels

        """
        self.config = config
        self.trainingDataSource = trainingDataSource
        self.validationDataSource = validationDataSource
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        self.checkpointer = ModelDescriptionCheckpointer(config, self.__class__.__name__)
        self.isLoaded = False
        self.bestValidationLoss = None

    def train(self):
        """Trains the model.

        Trains the model for epochs specified in the config.
        Runs the validation dataset on the model if specified in the config.
        """
        with self.graph.as_default():
            self.getOrLoadModel()

        for epoch in range(self.getEpochs()):
            self.runOnTrainingDataset(epoch)

            if self.shouldRunValidation():
                self.runOnValidationDataset(epoch)
                self.validationDataSource.reset()

            self.checkpointBestModel()
            self.trainingDataSource.reset()

    def checkpointBestModel(self):
        if self.bestValidationLoss is None:
            self.checkpoint("best")
            return

        if self.totalLoss < self.bestValidationLoss:
            logger.info("Updating best model with loss: " + str(self.totalLoss))
            self.bestValidationLoss = self.totalLoss
            self.checkpoint("best")
        else:
            self.checkpoint("checkpoint")


    def predict(self, inputs, requestedPredictions):
        with self.graph.as_default():
            self.getOrLoadModel()

        assert False, "Not Implemented"

        inputs = numpy.array(inputs)

        predictions = self.session.run(self.outputProbabilities,
                feed_dict={self.inputTokens : inputs})

        batchSize = requestedPredictions.shape[0]
        length = requestedPredictions.shape[1]

        outputPredictions = numpy.zeros(requestedPredictions.shape)

        for b in range(batchSize):
            for l in range(length):
                outputPredictions[b,l,:] = \
                    predictions[b,l,requestedPredictions[b,l,:]]
        return outputPredictions

    def getFeatures(self, inputs, secondInputs):
        with self.graph.as_default():
            self.getOrLoadModel()

        inputs = numpy.expand_dims(numpy.array(inputs), axis=2)
        secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2)
        inputs = numpy.concatenate([inputs, secondInputs], axis=2)

        # (batch, sequence, 2, embedding-size)
        predictions = self.session.run(self.features,
                feed_dict={self.inputTokens : inputs})

        return predictions[:, :, 0, :]


    def getOrLoadModel(self):
        """Returns a linear model.

        If specified, create a new model else load an already existing model.
        """
        if self.isLoaded:
            return

        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(
            self.checkpointer.getModelLoadDirectory()) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.loadModel()

        self.logModel()

    def logModel(self):
        totalParameters = 0
        for variable in tf.trainable_variables():
            shape = variable.get_shape()
            variableParameters = 1
            for dim in shape:
                variableParameters *= dim.value
            totalParameters += variableParameters
            logger.debug("Variable '" + variable.name + "' " +
                str(humanize.naturalsize(variableParameters)) + " (params) " +
                str(shape) + " (dims)")

        logger.debug("Total #params '" + str(humanize.naturalsize(totalParameters)) + "' ")

    def loadModel(self):
        """Loads an already existing model from the specified path """

        self.checkpointer.load()

        directory = self.checkpointer.getModelLoadDirectory()

        logger.debug("Loading checkpoint from: " + str(directory))

        tf.saved_model.loader.load(
            self.session,
            ["serve"],
            directory
        )

        self.setOperationsByName()

        self.isLoaded = True

    def setOperationsByName(self):
        self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0")
        self.labels = self.graph.get_tensor_by_name("output-labels:0")
        self.features = self.graph.get_tensor_by_name("features:0")
        self.vocabLoss = self.graph.get_tensor_by_name("vocab-loss:0")
        self.classificationLoss = self.graph.get_tensor_by_name("classification-loss:0")
        self.classLoss = self.graph.get_tensor_by_name("class-loss:0")
        self.outputProbabilities = self.graph.get_tensor_by_name("output-probabilities:0")
        self.outputDocumentClass = self.graph.get_tensor_by_name("output-document-class:0")
        self.loss = self.graph.get_tensor_by_name("loss:0")
        self.optimizerStep = self.graph.get_operation_by_name("optimizer-step")

    def createModel(self):
        # inputs (batch, sequence-length, 2)
        self.inputTokens = tf.placeholder(tf.int32, shape=(None, None, 2),
                name="input-tokens")

        # labels (batch, sequence-length, 2)
        self.labels = tf.placeholder(tf.int32, shape=(None, None, 2),
                name="output-labels")

        self.createClassMappings()

        # convert to classes (batch, sequence-length, 2, assignments)
        self.inputClasses = self.convertToClasses(self.inputTokens)
        self.classLabels  = self.convertToClasses(self.labels)

        # class logits (batch, sequence-length, 2, assignments, class-size)
        classLogits = self.runClassModel(self.inputClasses)

        # classification logits (batch, sequence-length, 2, assignments, 2)
        classificationLogits = self.runClassificationModel()

        # document classification logits (batch, sequence-length, 2, assignments, 2)
        documentClassificationLogits = self.runDocumentClassificationModel()

        # compute the losses
        self.clusterLoss = tf.identity(self.evaluateClusteringLoss(
            self.features, self.classLabels), name="clustering-loss")
        self.classificationLoss = tf.identity(self.evaluateClassificationLoss(
            classificationLogits, self.classLabels), name="classification-loss")
        self.documentClassificationLoss = tf.identity(self.evaluateDocumentClassificationLoss(
            documentClassificationLogits, self.classLabels), name="document-classification-loss")

        self.classLoss = tf.identity(self.evaluateLoss(classLogits[:, 1:, :, :, :],
            self.classLabels[:, 1:, :, :]), name="class-loss")
        self.vocabLoss = tf.identity(self.evaluateVocabLoss(classLogits[:, 1:, :, :, :],
            self.labels[:, 1:, :]), name="vocab-loss")

        self.loss = tf.identity(self.classLoss +
            self.classificationLoss +
            self.clusterLoss +
            self.vocabLoss,
            name="loss")

        # convert to vocab logits (batch, sequence-length, vocab-size)
        vocabLogits = self.expandClassLogitsToVocab(classLogits)

        self.outputProbabilities = tf.nn.softmax(vocabLogits,
                name="output-probabilities")

        self.outputDocumentClass = tf.reduce_max(documentClassificationLogits, axis=3)

        # optimizer
        self.optimizerStep = self.createOptimizerStep(self.loss, "")
        self.documentOptimizerStep = self.createOptimizerStep(self.documentClassificationLoss,
            "document")

        # initializers
        self.globalInitializer = tf.global_variables_initializer()
        self.localInitializer  = tf.local_variables_initializer()

        # summaries
        self.setupSummaries()

        # do the initialization
        self.initializeModel()

    def createClassMappings(self):

        mappings = numpy.zeros([self.getAssignmentCount(), self.vocab.getSize()],
            dtype=numpy.int32)
        weights = numpy.zeros([self.getAssignmentCount(), self.vocab.getSize()],
            dtype=numpy.float32)

        for assignment in range(self.getAssignmentCount()):
            mappings[assignment, :], weights[assignment, :] = self.createMapping(assignment)

        self.classMappingsHost = mappings
        self.classMappings = tf.constant(mappings)
        self.classWeights  = tf.constant(weights)

    def logAdd(self, left, right):

        if left is None:
            return right

        if left == float("-inf"):
            return right
        if right == float("-inf"):
            return left

        return max(left, right) + math.log1p(math.exp( -math.fabs(left - right)))

    def logSumArray(self, array):
        from functools import reduce
        return reduce(lambda x, y : self.logAdd(x, y), array)

    def logSubtract(self, left, right):

        if left <= right:
            assert False, "log of negative number in subtraction " + str(left) + " - " + str(right)

        if right == float("-inf"):
            return left

        return left + math.log1p(-math.exp(right - left))

    def createMapping(self, assignment):

        assert self.getNumberOfDirectClasses() <= self.getNumberOfClasses()
        assert self.getNumberOfDirectClasses() <= self.vocab.getSize()

        vocabSize       = self.vocab.getSize() - self.getNumberOfDirectClasses()
        numberOfClasses = self.getNumberOfClasses() - self.getNumberOfDirectClasses()

        directMapping = numpy.arange(self.getNumberOfDirectClasses(), dtype=numpy.int32)
        directWeights = numpy.ones(self.getNumberOfDirectClasses(), dtype=numpy.float32)

        mapping, weights = self.createLogMapping(assignment, vocabSize, numberOfClasses)

        return (numpy.concatenate([directMapping, self.getNumberOfDirectClasses() + mapping]),
            numpy.concatenate([directWeights, weights]))

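    # createLogMapping (below) partitions an assumed power-law distribution of word
    # counts into classes of roughly equal probability mass. Everything stays in log
    # space: logSumArray gives the log of the total mass, logClassSize the log of one
    # class's share, and logAdd/logSubtract accumulate the running mass (carrying any
    # remainder) while words are assigned greedily from most to least frequent. A
    # second pass then sets each word's weight to 1 / (number of words in its class).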
    def createLogMapping(self, assignment, vocabSize, numberOfClasses):

        generator = numpy.random.RandomState(seed=assignment)

        wordCounts = reversed([i * self.getWordFrequencyPowerLawExponent()
            for i in range(vocabSize)])

        wordCountsPlusRandom = [i + math.log(generator.uniform(0.0, 1000.0)) for i in wordCounts]

        logTotalCount = self.logSumArray(wordCountsPlusRandom)

        sortedWordCounts = sorted(enumerate(wordCountsPlusRandom), key=lambda x: x[1], reverse=True)

        logClassSize = logTotalCount - math.log(numberOfClasses)

        mapping = numpy.zeros([vocabSize], dtype=numpy.int32)
        weights = numpy.zeros([vocabSize], dtype=numpy.float32)

        currentClass = 0
        wordsInCurrentClass = 0
        logCurrentCount = None
        for wordIndex, logWordCount in sortedWordCounts:
            assert currentClass < numberOfClasses
            mapping[wordIndex] = currentClass

            wordsInCurrentClass += 1
            logCurrentCount = self.logAdd(logCurrentCount, logWordCount)
            if logCurrentCount >= logClassSize and currentClass + 1 != numberOfClasses:
                #print(logCurrentCount, logWordCount, currentClass, logClassSize)
                logCurrentCount = self.logSubtract(logCurrentCount, logClassSize)
                wordsInCurrentClass = 0
                currentClass += 1

        currentClass = 0
        currentClassSize = 0
        currentClassMembers = []
        for i, wordCountAndIndex in enumerate(sortedWordCounts):
            wordIndex, wordCount = wordCountAndIndex

            currentClassMembers.append(wordIndex)
            currentClassSize += 1

            # if end of current class
            if ((1 + i) == len(sortedWordCounts) or
                mapping[sortedWordCounts[1 + i][0]] != currentClass):

                for memberIndex in currentClassMembers:
                    weights[memberIndex] = 1.0 / currentClassSize

                if currentClass == 0 or i == (len(sortedWordCounts) - 1):
                    logger.info("current class " + str(currentClass) +
                        " members " + str(len(currentClassMembers)))

                currentClass += 1
                currentClassSize = 0
                currentClassMembers = []

        return mapping, weights

    def initializeModel(self):
        self.session.run(self.globalInitializer)
        self.session.run(self.localInitializer)

    def runOnTrainingDataset(self, epoch):
        """Trains the linear model on the training dataset for one epoch."""
        trainStart = time.time()

        totalLoss = 0.0
        message = None

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()

            try:
                inputs, labels, secondInputs, secondLabels = self.trainingDataSource.next()
            except Exception as e:
                if message is None:
                    message = str(e)
                break

            generatorEnd = time.time()

            trainStepStart = time.time()
            loss, gradNorm = self.trainingStep(inputs, labels, secondInputs, secondLabels,
                                               step, epoch)
            trainStepEnd = time.time()

            totalLoss += loss

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                "), Step (" + str(step) + " / " + str(self.getStepsPerEpoch()) +
                "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) +
                ", training step time: " + ("%.2f" % (trainStepEnd -
                    trainStepStart) +
                ", loss: " + str("%.2f" % loss) +
                ", grad norm: " + str("%.2f" % gradNorm)) +
                ", avg-loss: " + str("%.2f" % (totalLoss / (step + 1))))

            print(message, end="\r", flush=True)

        trainEnd = time.time()

        print(message)
        logger.debug(" Training took: " + (str(trainEnd - trainStart)) + " seconds...")

    def trainingStep(self, inputs, labels, secondInputs, secondLabels, step, epoch):
        """Training step for one minibatch of training data."""
        inputs = numpy.expand_dims(numpy.array(inputs), axis=2)
        labels = numpy.expand_dims(numpy.array(labels), axis=2)
        secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2)
        secondLabels = numpy.expand_dims(numpy.array(secondLabels), axis=2)

        inputs = numpy.concatenate([inputs, secondInputs], axis=2)
        labels = numpy.concatenate([labels, secondLabels], axis=2)

        if self.getShouldClassifyDocument():
            optimizerStep = self.documentOptimizerStep
            loss = self.documentClassificationLoss
        else:
            optimizerStep = self.optimizerStep
            loss = self.loss

        trainingLoss, gradNorm, summaries, _ = self.session.run([loss,
            self.gradientNorm, self.mergedSummary, optimizerStep],
            feed_dict={self.inputTokens : inputs, self.labels : labels })

        # log to tensorboard every getStepsPerTensorboardLog() steps
        if step % self.getStepsPerTensorboardLog() == 0:
            self.trainingSummaryWriter.add_summary(summaries,
                step + epoch * self.getStepsPerEpoch())
        return trainingLoss, gradNorm

    def runOnValidationDataset(self, epoch):
        """Runs the linear model on the validation dataset for one epoch."""

        validationStart = time.time()

        self.totalLoss = 0.0
        self.totalVocabLoss = 0.0

        message = None

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()

            try:
                inputs, labels, secondInputs, secondLabels = self.validationDataSource.next()
            except Exception as e:
                if message is None:
                    message = str(e)
                break

            generatorEnd = time.time()

            validationStepStart = time.time()
            loss, vocabLoss = self.validationStep(inputs, labels, secondInputs, secondLabels)
            validationStepEnd = time.time()

            self.totalLoss += loss
            self.totalVocabLoss += vocabLoss

            message = ("Validation Step (" + str(step) + " / " +
                    str(self.getValidationStepsPerEpoch()) +
                "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) +
                ", validation step time: " + ("%.2f" % (validationStepEnd - validationStepStart)) +
                ", avg-loss: " + ("%.2f" % (self.totalLoss/(step + 1))))

            print(message, end="\r", flush=True)

        validationEnd = time.time()

        print(message)
        logger.debug(" Validation took: " + (str(validationEnd - validationStart)) + " seconds...")

        self.addValidationSummaries(self.totalLoss, self.totalVocabLoss, epoch)

    def addValidationSummaries(self, totalLoss, vocabLoss, epoch):

        averageLoss = totalLoss / self.getValidationStepsPerEpoch()

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="validation-loss", simple_value=averageLoss),
        ])

        self.trainingSummaryWriter.add_summary(summary, epoch)

        averageVocabLoss = vocabLoss / self.getValidationStepsPerEpoch()

        summary = tf.Summary(value=[
            tf.Summary.Value(tag="validation-vocab-cross-entropy", simple_value=averageVocabLoss),
        ])

        self.trainingSummaryWriter.add_summary(summary, epoch)

    def validationStep(self, inputs, labels, secondInputs, secondLabels):
        """One minibatch of validation data processed by the model."""

        inputs = numpy.expand_dims(numpy.array(inputs), axis=2)
        labels = numpy.expand_dims(numpy.array(labels), axis=2)
        secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2)
        secondLabels = numpy.expand_dims(numpy.array(secondLabels), axis=2)

        inputs = numpy.concatenate([inputs, secondInputs], axis=2)
        labels = numpy.concatenate([labels, secondLabels], axis=2)

        if self.getShouldClassifyDocument():
            loss = self.documentClassificationLoss
        else:
            loss = self.loss

        validationLoss, vocabLoss = self.session.run([loss, self.vocabLoss],
                feed_dict={self.inputTokens : inputs,
                self.labels : labels})
        return validationLoss, vocabLoss

    def createOptimizerStep(self, loss, name):
        """One step of backprop."""

        optimizer = tf.train.AdamOptimizer(
            learning_rate=float(self.config["model"]["learning-rate"]),
            beta1=0.9,
            beta2=0.98,
            epsilon=10e-9,
            name=name+"optimizer-step")

        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(gradients,
        self.config["model"]["gradient-clipping-factor"])
        self.gradientNorm = tf.global_norm(gradients, name="gradient-norm")

        return optimizer.apply_gradients(zip(gradients, variables))

    def setupSummaries(self):
        tf.summary.scalar('total-loss', self.loss)
        if self.getShouldClassifyDocument():
            tf.summary.scalar('document-class-cross-entropy', self.documentClassificationLoss)
        else:
            tf.summary.scalar('document-match-cross-entropy', self.classificationLoss)
            tf.summary.scalar('vocab-cross-entropy', self.vocabLoss)
            tf.summary.scalar('class-cross-entropy', self.classLoss)
            tf.summary.scalar('cluster-loss', self.clusterLoss)
        tf.summary.scalar('gradient-norm', self.gradientNorm)

        self.mergedSummary = tf.summary.merge_all()

        self.trainingSummaryWriter = tf.summary.FileWriter(
            os.path.join(self.getExperimentDirectory(), 'training-summaries'),
            self.graph)

        #if self.shouldRunValidation():
        #    self.validationSummaryWriter = tf.summary.FileWriter(
        #        os.path.join(self.getExperimentDirectory(), 'validation-summaries'),
        #        self.graph)

    def evaluateClusteringLoss(self, features, classLabels):
        # features is [batch, sequence, 2, assignments, feature-dimension]
        # class labels is [batch, sequence, 2, assignments]
        assignmentLosses = []

        batchSize = tf.shape(features)[0]
        sequenceLength = tf.shape(features)[1]

        features = tf.reshape(self.features, (batchSize, sequenceLength,
            2, self.getAssignmentCount(), self.getEmbeddingSize()))

        for i in range(self.getAssignmentCount()):
            assignmentLosses.append(self.evaluatePerAssignmentClusterLoss(
                features[:, :, :, i, :], classLabels[:, :, :, i]))

        return sum(assignmentLosses) / (tf.multiply(tf.cast(batchSize, dtype=tf.float32),
            2.0 * self.getAssignmentCount()))

    def evaluatePerAssignmentClusterLoss(self, features, labels):
        # features is [batch, sequence, 2, feature-dim]
        # labels is [batch, sequence, 2]
        wordFeatures = tf.reshape(features[:, 0, :, :], (-1, self.getEmbeddingSize()))
        tripletLabels = tf.reshape(labels[:, 0, :], (-1, ))

        return self.tripletLoss(wordFeatures, tripletLabels)

    def tripletLoss(self, features, labels):
        return tf.contrib.losses.metric_learning.triplet_semihard_loss(labels, features)

    def evaluateClassificationLoss(self, batchOutputs, labels):
        # batch outputs is [batch, assignments, 2]
        # labels is [batch, sequence, 2, assignments, 1]
        labels = tf.cast(tf.equal(labels[:, 0, 0, :, 0], labels[:, 0, 1, :, 0]), tf.int32)
        return tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=batchOutputs)

    def evaluateDocumentClassificationLoss(self, batchOutputs, labels):
        # batch outputs is [batch, 2, assignments, 2]
        # labels is [batch, sequence, 2, assignments, 1]
        labels = labels[:,0,:,:,0]
        return tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=batchOutputs)

    def evaluateLoss(self, batchOutputs, labels):
        return tf.losses.sparse_softmax_cross_entropy(
            labels=labels,
            logits=batchOutputs)

    def klDivergence(self, a, b):
        a = tf.distributions.Categorical(probs=a + numpy.finfo(float).eps)
        b = tf.distributions.Categorical(probs=tf.nn.softmax(b) + numpy.finfo(float).eps)
        return tf.reduce_mean(tf.distributions.kl_divergence(a, b, allow_nan_stats=False))

    def convertToClasses(self, inputs):
        # inputs is (batch, sequence, 2)
        # class mappings is (assignments, vocab size)
        # outputs is (batch, sequence, 2, assignments)
        batchSize      = tf.shape(inputs)[0]
        sequenceLength = tf.shape(inputs)[1]

        classes = tf.concat([tf.reshape(tf.gather(self.classMappings[i, :], inputs),
                                        (batchSize, sequenceLength, 2, 1))
            for i in range(self.getAssignmentCount())], axis=3)

        return tf.reshape(classes, (batchSize, sequenceLength, 2, self.getAssignmentCount(), 1))

    def expandClassLogitsToVocab(self, classLogits):
        # class logits is (batch size, sequence-length, 2, assignments, class-size)
        # class mappings is (class-assignments, vocab-size)
        # class weights is (class-assignments, vocab-size)
        # output is (batch-size, sequence-length, 2, vocab-size)
        batchSize      = tf.shape(classLogits)[0]
        sequenceLength = tf.shape(classLogits)[1]

        gatheredLogits = tf.concat([tf.reshape(tf.gather(classLogits[:,:,:,i,:], self.classMappings[i, :], axis=3),
                                    (batchSize, sequenceLength, 2, 1, self.vocab.getSize()))
            for i in range(self.getAssignmentCount())], axis=3)

        return tf.reduce_mean(tf.multiply(gatheredLogits, self.classWeights), axis=3)

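    # evaluateVocabLoss (below) approximates the full-vocabulary cross entropy with a
    # sampled softmax: generateSamples draws candidate tokens per assignment,
    # extendLogits/extendWeights prepend the logit and weight of the true label at
    # index 0 of the sample axis, and the loss is then evaluated against all-zero
    # labels, i.e. against that first slot.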
    def evaluateVocabLoss(self, classLogits, vocabLabels):
        # labels is (batch size, sequence-length, 2)

        batchSize      = tf.shape(classLogits)[0]
        sequenceLength = tf.shape(classLogits)[1]

        sampleCount = self.getSoftmaxSampleCount()
        samples = self.generateSamples(sampleCount)
        sampledLabels = tf.zeros((batchSize, sequenceLength, 2), dtype=tf.int32)

        # sampled mappings is (assignment count, sample count)
        sampledMappings = self.sample(self.classMappings, samples, sampleCount)

        # sampled weights is (assignment count, sample count)
        sampledWeights = self.sample(self.classWeights, samples, sampleCount)

        # gathered logits is (batch size, sequence length, 2, assignment count, sample count)
        gatheredLogits = tf.concat([tf.reshape(tf.gather(classLogits[:,:,:,i,:], sampledMappings[i,:], axis=3),
                                    (batchSize, sequenceLength, 2, 1, sampleCount))
            for i in range(self.getAssignmentCount())], axis=3)

        # gathered weights is (batch size, sequence length, 2, assignment count, sample count)
        gatheredWeights = self.broadcastToExpandedDimension(sampledWeights, batchSize, sequenceLength)

        # gathered logits and weights is (batch size, sequence length, 2, assignment count, sample count + 1)
        gatheredLogits  = self.extendLogits(gatheredLogits, classLogits, vocabLabels)
        gatheredWeights = self.extendWeights(gatheredWeights, vocabLabels)

        # weighted logits is (batch size, sequence length, 2, assignments, sample count + 1)
        weightedLogits = tf.multiply(gatheredLogits, gatheredWeights)

        # vocab logits is (batch size, sequence length, 2, sample count + 1)
        vocabLogits = tf.reduce_mean(weightedLogits, axis=3)

        return self.evaluateLoss(vocabLogits[:, 1:, :, :], sampledLabels[:, 1:, :])

    def generateSamples(self, sampleCount):
        samplesPerAssignment = []

        # TODO: BUG: Don't sample the label
        for assignment in range(self.getAssignmentCount()):
            samples, _, _ = tf.random.uniform_candidate_sampler(
                true_classes=tf.broadcast_to(tf.range(self.vocab.getSize(), dtype=tf.int64),
                                             (1, self.vocab.getSize())),
                num_true=self.vocab.getSize(),
                num_sampled=sampleCount,
                range_max=self.vocab.getSize(),
                unique=True)

            samplesPerAssignment.append(tf.reshape(samples, (1, -1)))

        return tf.concat(samplesPerAssignment, axis=0)

    def extendLogits(self, vocabLogits, classLogits, labels):
        # class logits is (batch size, sequence length, 2, assignment count, sample count)
        # map is (assignment count, vocab size)
        # labels is (batch size, sequence length, 2)
        batchSize      = tf.shape(classLogits)[0]
        sequenceLength = tf.shape(classLogits)[1]

        # labelClasses is (batch size, sequence length, 2, assignment count, 1)
        labelClasses = tf.concat(
            [tf.reshape(tf.gather(self.classMappings[i, :], labels),
                (batchSize, sequenceLength, 2, 1, 1)) for i in range(self.getAssignmentCount())],
            axis=3)

        # gathered logits is (batch size, sequence length, 2, assignment count, 1)
        gatheredLogits = tf.batch_gather(classLogits, labelClasses)

        return tf.concat([gatheredLogits, vocabLogits], axis=4)

    def extendWeights(self, vocabWeights, labels):
        # vocab weights is (batch size, sequence length, 2, assignment count, sample count)
        # labels is (batch size, sequence length)
        batchSize      = tf.shape(vocabWeights)[0]
        sequenceLength = tf.shape(vocabWeights)[1]

        # labelWeights is (batch size, sequence length, 2, assignment count, 1)
        labelWeights = tf.concat(
            [tf.reshape(tf.gather(self.classWeights[i, :], labels),
                (batchSize, sequenceLength, 2, 1, 1)) for i in range(self.getAssignmentCount())],
            axis=3)

        return tf.concat([labelWeights, vocabWeights], axis=4)

    def sample(self, mappings, samples, sampleCount):

        assignments = []

        for i in range(self.getAssignmentCount()):
            assignments.append(tf.reshape(tf.gather(mappings[i, :], samples[i,:]), (1, sampleCount)))

        return tf.concat(assignments, axis=0)

    def broadcastToExpandedDimension(self, tensor, batchSize, sequenceLength):
        classAssignments = tensor.shape[0]
        vocabSize = tensor.shape[1]

        newShape = (batchSize, sequenceLength, 2, classAssignments, vocabSize)

        expandedTensor = tf.broadcast_to(tensor, newShape)

        #print(expandedTensor.shape)

        reshapedTensor = tf.reshape(expandedTensor, newShape)
        #print(reshapedTensor.shape)

        return reshapedTensor

    def runClassModel(self, inputs):
        #print("inputs", inputs.shape)

        inputEmbeddings = self.convertToEmbeddings(inputs)

        #print("inputEmbeddings", inputEmbeddings.shape)

        # run encoder (logits is (batch-size, sequence-length, assignments, class-count))
        encodedEmbeddings = self.runEncoder(inputEmbeddings)

        logits = self.runDecoder(encodedEmbeddings)

        #print("logits", logits.shape)

        return logits

    def runClassificationModel(self):
        batchSize = tf.shape(self.features)[0]
        sequenceLength = tf.shape(self.features)[1]

        features = tf.reshape(self.features, (batchSize, sequenceLength, 2,
            self.getAssignmentCount(), self.getEmbeddingSize()))

        features = self.multiheadedAttention(features)

        # features is (batch-size, sequence-length, 2, assignments, embedding-size)
        reducedFeatures = tf.reduce_max(features, axis=1)

        # reducedFeatures is (batch size, 2, assignments, embedding-size)
        transposedFeatures = tf.transpose(reducedFeatures, [0,2,1,3])

        # transposedFeatures is (batch size, assignments, 2, embedding-size)
        reshapedFeatures = tf.reshape(transposedFeatures, (-1, self.getAssignmentCount(),
            2 * self.getEmbeddingSize()))

        return tf.layers.dense(reshapedFeatures, units=2)

    def runDocumentClassificationModel(self):
        batchSize = tf.shape(self.features)[0]
        sequenceLength = tf.shape(self.features)[1]

        features = tf.reshape(self.features, (batchSize, sequenceLength, 2,
            self.getAssignmentCount(), self.getEmbeddingSize()))

        features = self.multiheadedAttention(features)

        # features is (batch-size, sequence-length, 2, assignments, embedding-size)
        reducedFeatures = tf.reduce_max(features, axis=1)

        # transposedFeatures is (batch size, assignments, 2, embedding-size)
        reshapedFeatures = tf.reshape(reducedFeatures, (-1, 2, self.getAssignmentCount(),
            self.getEmbeddingSize()))

        return tf.layers.dense(reshapedFeatures, units=2)

    def convertToEmbeddings(self, sequenceIds):
        assignments = []
        for assignment in range(self.getAssignmentCount()):
            assignments.append(self.convertToClassEmbeddings(sequenceIds, assignment))

        return tf.concat(assignments, axis = 3)

    def convertToClassEmbeddings(self, ids, assignment):

        with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE):
            wordEmbeddingsGlobal = tf.get_variable('class-embeddings-' + str(assignment), \
                    [self.getNumberOfClasses(), self.getEmbeddingSize()])

        wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal, ids[:, :, :, assignment, :])
        return wordEmbeddings

    def runEncoder(self, embeddings):
        return self.multiheadedAttentionStack(embeddings)

    def runDecoder(self, embeddings):

        batchSize      = tf.shape(embeddings)[0]
        sequenceLength = tf.shape(embeddings)[1]
        # embeddings is (batch size, sequence length, 2, assignments, classes)
        return tf.concat([tf.reshape(tf.layers.dense(embeddings[:,:,:,i,:], units=self.getNumberOfClasses()),
                (batchSize, sequenceLength, 2, 1, self.getNumberOfClasses()))
            for i in range(self.getAssignmentCount())], axis=3)

    def multiheadedAttentionStack(self, embeddings):

        embeddings = self.addPositions(embeddings)

        # embeddings (batch-size, sequence-length, 2, assignments, hidden-dimension)
        for layer in range(self.getNumberOfLayers()):
            embeddings = self.multiheadedAttention(embeddings)

            if self.isMiddleLayer(layer):
                batchSize      = tf.shape(embeddings)[0]
                sequenceLength = tf.shape(embeddings)[1]

                self.features = tf.identity(tf.reshape(embeddings, (batchSize, sequenceLength, 2,
                    self.getAssignmentCount() * self.getEmbeddingSize())), name="features")

        return embeddings

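    # addPositions (below) injects a variant of the Transformer's fixed sinusoidal
    # position encoding: angle(pos, d) = pos / (2 * halfSequenceLength) ** (2 * d /
    # embeddingSize); sin(angle) fills the even sequence positions and cos(angle) the
    # odd ones before the result is added to the token embeddings.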
    def addPositions(self, embeddings):
        batchSize      = tf.shape(embeddings)[0]
        sequenceLength = tf.shape(embeddings)[1]

        halfSequenceLength = (sequenceLength + 1) // 2

        positions = tf.cast(tf.reshape(tf.range(halfSequenceLength),
            (1, halfSequenceLength, 1, 1, 1)), dtype=tf.float32)
        dimensions = tf.cast(tf.reshape(tf.range(self.getEmbeddingSize()),
            (1, 1, 1, 1, self.getEmbeddingSize())), dtype=tf.float32)

        angles = positions / tf.pow(2.0 * tf.cast(halfSequenceLength, dtype=tf.float32),
                                    2.0 * dimensions / self.getEmbeddingSize())

        evenPositionEmbeddings = tf.reshape(tf.sin(angles),
            (1, halfSequenceLength, 1, 1, 1, self.getEmbeddingSize()))
        oddPositionEmbeddings  = tf.reshape(tf.cos(angles),
            (1, halfSequenceLength, 1, 1, 1, self.getEmbeddingSize()))

        # merge them
        positionEmbeddings = tf.concat([evenPositionEmbeddings, oddPositionEmbeddings], axis=2)
        positionEmbeddings = tf.reshape(positionEmbeddings,
            (1, 2 * halfSequenceLength, 1, 1, self.getEmbeddingSize()))

        positionEmbeddings = positionEmbeddings[:, 0:sequenceLength, :, :, :]

        return embeddings + positionEmbeddings

    def isMiddleLayer(self, layer):
        if self.getNumberOfLayers() > 1:
            return layer == (self.getNumberOfLayers() - 2)

        return layer == (self.getNumberOfLayers() - 1)

    def multiheadedAttention(self, embeddings):
        # embeddings (batch-size, sequence-length, assignments, hidden-dimension)
        projectedEmbeddings = self.projectEmbeddings(embeddings)

        # proj-embeddings (batch-size, sequence-length, assignments, QKV, attention-heads, hidden-dimension)
        attentionOutput = self.runAttention(projectedEmbeddings)

        # project back
        outputEmbeddings = self.projectBackEmbeddings(attentionOutput)

        # add and norm
        embeddings = self.addAndNorm(outputEmbeddings, embeddings)

        # dense layer
        denseOutput = tf.layers.dense(embeddings,
            self.getEmbeddingSize(), activation="relu")

        # add and norm
        denseOutput = self.addAndNorm(denseOutput, embeddings)

        return denseOutput

    def projectEmbeddings(self, embeddings):
        output = tf.layers.dense(embeddings,
            embeddings.shape[-1] * 3 * self.getNumberOfAttentionHeads())

        batchSize      = tf.shape(embeddings)[0]
        sequenceLength = tf.shape(embeddings)[1]
        assignments    = embeddings.shape[3]

        return tf.reshape(output,
            (batchSize, sequenceLength, 2, assignments, 3,
             self.getNumberOfAttentionHeads(), embeddings.shape[-1]))

    def projectBackEmbeddings(self, embeddings):
        # embeddings are (batch-size, sequence-length, 2, assignments, attention-heads, embedding-size)
        # project to (batch-size, sequence-length, 2, assignments, embedding-size)

        batchSize      = tf.shape(embeddings)[0]
        sequenceLength = tf.shape(embeddings)[1]
        assignments    = embeddings.shape[3]

        reshapedEmbeddings = tf.reshape(embeddings, (batchSize, sequenceLength, 2, assignments,
            embeddings.shape[-1] * embeddings.shape[-2]))

        projectedEmbeddings = tf.layers.dense(reshapedEmbeddings, self.getEmbeddingSize())

        return projectedEmbeddings

    def addAndNorm(self, left, right):
        return tf.contrib.layers.layer_norm(tf.add(left, right))

    def runAttention(self, embeddings):
        # Q,K,V (batch-size, sequence-length, 2, assignments, attention-heads, hidden-dimension)
        Q = embeddings[:,:,:,:,0,:,:]
        K = embeddings[:,:,:,:,1,:,:]
        V = embeddings[:,:,:,:,2,:,:]

        readOn = tf.matmul(Q, K, transpose_b=True)

        scale = math.sqrt(self.getEmbeddingSize())

        scaledReadOn = readOn / scale

        contribution = tf.nn.softmax(scaledReadOn, axis=1)

        result = tf.matmul(contribution, V)

        return result

    def checkpoint(self, prefix):
        """Creates a checkpoint of the current model and saves to model
        directory.
        """

        self.checkpointer.setPrefix(prefix)
        directory = self.checkpointer.getModelSaveDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()

        with self.graph.as_default():
            tf.saved_model.simple_save(self.session,
                directory,
                inputs={"input_text" : self.inputTokens},
                outputs={"outputs" : self.outputDocumentClass})

        self.checkpointer.cleanup()


    """Functions to load configuration parameters."""
    def getEmbeddingSize(self):
        return int(self.config["model"]["embedding-size"])

    def getAssignmentCount(self):
        return int(self.config["model"]["assignment-count"])

    def getSoftmaxSampleCount(self):
        return int(self.config["model"]["softmax-sample-count"])

    def getNumberOfClasses(self):
        return int(self.config["model"]["number-of-classes"])

    def getNumberOfDirectClasses(self):
        return int(self.config["model"]["number-of-direct-classes"])

    def getNumberOfLayers(self):
        return int(self.config["model"]["number-of-layers"])

    def getNumberOfAttentionHeads(self):
        return int(self.config["model"]["number-of-attention-heads"])

    def getWordFrequencyPowerLawExponent(self):
        return float(self.config["model"]["word-frequency-power-law-exponent"])

    def shouldRunValidation(self):
        return self.config["model"]["run-validation"]

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "create-new-model" in self.config["model"]:
            return False
        return bool(self.config["model"]["create-new-model"])

    def getShouldClassifyDocument(self):
        if not "classify-document" in self.config["model"]:
            return False
        return bool(self.config["model"]["classify-document"])

    def getStepsPerEpoch(self):
        return int(self.config["model"]["steps-per-epoch"])

    def getStepsPerTensorboardLog(self):
        return int(self.config["model"]["steps-per-tensorboard-log"])

    def getValidationStepsPerEpoch(self):
        return int(self.config["model"]["validation-steps-per-epoch"])

    def getExperimentDirectory(self):
        return self.config["model"]["directory"]
Example #10
import sys
sys.path.append('source')

from models.Vocab import Vocab
from sklearn.cluster import MiniBatchKMeans

import os
import numpy

directory = 'output-features-16k-classes-2-layers-200MB-3'
numberOfClusters = 16

vocab = Vocab({"model": {"vocab": os.path.join(directory, 'vocab.txt')}})

embeddings = numpy.load(os.path.join(directory, 'features.npy'))
inputs = numpy.load(os.path.join(directory, 'inputs.npy'))
labels = numpy.load(os.path.join(directory, 'labels.npy'))

chunkCount = embeddings.shape[0]
chunkLength = embeddings.shape[1]

clusters = numpy.reshape(
    MiniBatchKMeans(n_clusters=numberOfClusters).fit_predict(
        numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
    (chunkCount, chunkLength))

clusterMap = {i: [] for i in range(numberOfClusters)}

for chunk in range(chunkCount):
    chunkString = [
        vocab.getTokenString(labels[chunk, word])
        for word in range(chunkLength)
    ]

    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def rewriteSplitTokens(self, inputs, labels, predictions):
        from functools import reduce

        newInputs = []
        newPredictions = []
        newVocabProbabilities = []

        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # collapse expanded tokens
        for batch in range(batchSize):

            inputString = "".join([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ])
            reservedIndices = set([
                index for index, token in enumerate(labels[batch, :])
                if Vocab.isReservedToken(token)
            ])

            tokenizer = UnlimitedVocabTokenizerAdaptor(
                StringDataSource(inputString))

            completeTokens = [
                tokenizer.next() for i in range(tokenizer.size())
            ]

            logger.debug("Reformed input string: '" + str([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ]))
            logger.debug("' tokenized to: " + str(completeTokens))
            logger.debug(
                " tokens: " +
                str([self.vocab.getToken(token) for token in completeTokens]))

            index = 0
            completeTokenIndex = 0

            newBatchInputs = []
            newBatchPredictions = []
            newBatchVocabProbabilities = []

            while index < sequenceLength:
                token = labels[batch, index]
                completeToken = completeTokens[completeTokenIndex]

                # get token end
                tokenEndIndex = index + 1
                if self.vocab.getToken(
                        completeToken
                ) != token and not index in reservedIndices:
                    while tokenEndIndex < sequenceLength:
                        possibleToken = labels[batch, tokenEndIndex]
                        if (completeTokenIndex + 1) < len(completeTokens):
                            if self.vocab.getToken(
                                    completeTokens[completeTokenIndex +
                                                   1]) == possibleToken:
                                break
                        tokenEndIndex += 1

                # add token
                newBatchInputs.append([index, tokenEndIndex])
                newBatchVocabProbabilities.append(
                    list(predictions[batch, index, :]))
                newBatchVocabProbabilities[-1][0] = 0.0

                # compute new probabilities for the merged token
                predictionValues = predictions[batch, index:tokenEndIndex, 0]
                newBatchPredictions.append(
                    reduce(lambda x, y: x * y, predictionValues))

                if tokenEndIndex > (index + 1):
                    logger.debug("Reformed split tokens: " + str([
                        self.vocab.getTokenString(token)
                        for token in labels[batch, index:tokenEndIndex]
                    ]) + (" with prob: %.4f" % newBatchPredictions[-1]))

                if not index in reservedIndices:
                    completeTokenIndex += 1

                index = tokenEndIndex

            newInputs.append(newBatchInputs)
            newPredictions.append(newBatchPredictions)
            newVocabProbabilities.append(newBatchVocabProbabilities)

        # pad
        maxLength = max([len(tokens) for tokens in newInputs])

        newInputs = [
            inputs +
            [self.getPadToken() for i in range(maxLength - len(inputs))]
            for inputs in newInputs
        ]
        newPredictions = [
            predictions + [0.0 for i in range(maxLength - len(predictions))]
            for predictions in newPredictions
        ]
        newVocabProbabilities = [
            predictions + [0.0 for i in range(maxLength - len(predictions))]
            for predictions in newVocabProbabilities
        ]

        return numpy.array(newInputs), numpy.array(
            newPredictions), numpy.array(newVocabProbabilities)
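
When a label token was split into several sub-tokens, rewriteSplitTokens multiplies the per-piece probabilities (the reduce call above) to obtain a single probability for the merged token. A trivial illustration of that reduction:

from functools import reduce

# probability of a merged token is the product of its piece probabilities
pieceProbabilities = [0.9, 0.8, 0.5]
mergedProbability = reduce(lambda x, y: x * y, pieceProbabilities)

print(mergedProbability)  # approximately 0.36
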
Example #13
def runLocally(arguments):
    import numpy

    numpy.set_printoptions(precision=3, linewidth=150)

    device = getDevice()
    with tf.device(device):
        for scope in arguments["enable_logger"]:
            logger = logging.getLogger(scope)
            logger.setLevel(logging.DEBUG)

        config = loadConfig(arguments)

        overrideConfig(config, arguments)

        if arguments["predict"]:

            if not "predictor" in config:
                config["predictor"] = {}

            validationData = getValidationData(config)
            predictor = getPredictor(config, validationData)
            perplexity = predictor.predict()

            print("Perplexity " + str(perplexity))

        elif arguments["make_clusters"]:

            validationData = getValidationData(config)
            clusterer = getClusterer(config, validationData,
                                     arguments["output_directory"],
                                     int(arguments["cluster_count"]))

            clusterer.groupDataIntoClusters()

        elif arguments["make_test_set"]:
            validationData = getValidationData(config)

            if int(arguments["test_set_size"]) > 0:
                saveData(validationData, int(arguments["test_set_size"]),
                         arguments["output_directory"], Vocab(config))
            else:
                assert int(arguments["test_set_size_bytes"]) > 0
                saveDataBytes(validationData,
                              int(arguments["test_set_size_bytes"]),
                              arguments["output_directory"], Vocab(config))

        elif arguments["make_vocab"]:
            validationData = getValidationData(config)

            saveVocab(validationData, int(arguments["vocab_size"]),
                      arguments["output_directory"])

        else:
            config["model"]["directory"] = nameDirectory(
                arguments["experiment_name"])

            makeExperiment(config)

            trainingData = getTrainingData(config)
            validationData = getValidationData(config)

            model = getModel(config, trainingData, validationData)
            model.train()
Example #14
    def loadVocab(self):
        return Vocab(self.config)
Example #15
class LinearModel:
    def __init__(self, config, trainingDataSource, validationDataSource):
        """Initializes the linear model object.

        Attributes:
            config: The configuration for the model.
            trainingDataSource: list of training samples and labels
            validationDataSource: list of validation samples and labels

        """
        self.config = config
        self.trainingDataSource = trainingDataSource
        self.validationDataSource = validationDataSource
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        self.checkpointer = ModelDescriptionCheckpointer(config, "LinearModel")
        self.isLoaded = False

    def train(self):
        """Trains the linear model.

        Trains the model for epochs specified in the config.
        Runs the validation dataset on the model if specified in the config.
        """
        with self.graph.as_default():
            self.getOrLoadModel()

        for epoch in range(self.getEpochs()):
            self.runOnTrainingDataset(epoch)

            if self.shouldRunValidation():
                self.runOnValidationDataset(epoch)

            self.checkpoint()

    def predict(self, inputs, requestedPredictions):
        with self.graph.as_default():
            self.getOrLoadModel()

        inputs = numpy.array(inputs)

        predictions = self.session.run(self.outputProbabilities,
                                       feed_dict={self.inputTokens: inputs})

        batchSize = requestedPredictions.shape[0]
        length = requestedPredictions.shape[1]

        outputPredictions = numpy.zeros(requestedPredictions.shape)

        for b in range(batchSize):
            for l in range(length):
                outputPredictions[b,l,:] = \
                    predictions[b,l,requestedPredictions[b,l,:]]
        return outputPredictions

    def getOrLoadModel(self):
        """Returns a linear model.

        If specified, create a new model else load an already existing model.
        """
        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(self.checkpointer.getModelDirectory(
        )) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.loadModel()

    def loadModel(self):
        """Loads an already existing model from the specified path """
        if self.isLoaded:
            return

        self.checkpointer.load()

        directory = self.checkpointer.getModelDirectory()

        logger.debug("Loading checkpoint from: " + str(directory))

        tf.saved_model.loader.load(self.session, ["serve"], directory)

        self.setOperationsByName()

        self.isLoaded = True

    def setOperationsByName(self):
        self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0")
        self.labels = self.graph.get_tensor_by_name("output-labels:0")
        self.outputProbabilities = \
            self.graph.get_tensor_by_name("output-probabilities:0")
        self.loss = self.graph.get_tensor_by_name("loss:0")
        self.optimizerStep = self.graph.get_operation_by_name("optimizer-step")

    def createModel(self):
        # inputs (batch, sequence-length)
        self.inputTokens = tf.placeholder(tf.int32,
                                          shape=(None, None),
                                          name="input-tokens")

        self.labels = tf.placeholder(tf.int32,
                                     shape=(None, None),
                                     name="output-labels")

        predictedLogits = self.processInputMiniBatch(self.inputTokens)
        self.loss = self.evaluateLoss(predictedLogits, self.labels)
        self.outputProbabilities = tf.nn.softmax(predictedLogits,
                                                 name="output-probabilities")

        # optimizer
        self.optimizerStep = self.createOptimizerStep(self.loss)

        # initializers
        self.globalInitializer = tf.global_variables_initializer()
        self.localInitializer = tf.local_variables_initializer()

        # summaries
        self.setupSummaries()

        # do the initialization
        self.initializeModel()

    def initializeModel(self):
        self.session.run(self.globalInitializer)
        self.session.run(self.localInitializer)

    def runOnTrainingDataset(self, epoch):
        """Trains the linear model on the training dataset for one epoch."""
        trainStart = time.time()

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.trainingDataSource.next()

            generatorEnd = time.time()

            trainStepStart = time.time()
            loss, gradNorm = self.trainingStep(inputs, labels, step)
            trainStepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                       "), Step (" + str(step) + " / " +
                       str(self.getStepsPerEpoch()) + "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", training step time: " +
                       ("%.2f" % (trainStepEnd - trainStepStart) + ", loss: " +
                        str("%.2f" % loss) + ", grad norm: " +
                        str("%.2f" % gradNorm)))

            print(message, end="\r", flush=True)

        trainEnd = time.time()

        print(message)
        logger.debug(" Training took: " + (str(trainEnd - trainStart)) +
                     " seconds...")

    def trainingStep(self, inputs, labels, step):
        """Training step for one minibatch of training data."""
        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        trainingLoss, gradNorm, summaries, _ = self.session.run(
            [
                self.loss, self.gradientNorm, self.mergedSummary,
                self.optimizerStep
            ],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })

        self.trainingSummaryWriter.add_summary(summaries, step)
        return trainingLoss, gradNorm

    def runOnValidationDataset(self, epoch):
        """Runs the linear model on the validation dataset for one epoch."""

        validationStart = time.time()

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.validationDataSource.next()

            generatorEnd = time.time()

            validationStepStart = time.time()
            loss, summary = self.validationStep(inputs, labels)
            validationStepEnd = time.time()

            message = ("Validation Step (" + str(step) + " / " +
                       str(self.getValidationStepsPerEpoch()) +
                       "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", validation step time: " +
                       ("%.2f" % (validationStepEnd - validationStepStart) +
                        ", loss: " + str(loss)))

            print(message, end="\r", flush=True)

        validationEnd = time.time()

        print(message)
        logger.debug(" Validation took: " +
                     (str(validationEnd - validationStart)) + " seconds...")

    def validationStep(self, inputs, labels):
        """One minibatch of validation data processed by the model."""

        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        validationLoss, summaries = self.session.run(
            [self.loss, self.mergedSummary],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })
        return validationLoss, summaries

    def createOptimizerStep(self, loss):
        """One step of backprop."""

        optimizer = tf.train.AdamOptimizer(learning_rate=float(
            self.config["model"]["learningRate"]),
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=numpy.finfo(float).eps,
                                           name="optimizer-step")

        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(
            gradients, self.config["model"]["gradientClippingFactor"])
        self.gradientNorm = tf.global_norm(gradients, name="gradient-norm")

        return optimizer.apply_gradients(zip(gradients, variables))

    def setupSummaries(self):
        tf.summary.scalar('cross-entropy', self.loss)
        tf.summary.scalar('gradient-norm', self.gradientNorm)

        self.mergedSummary = tf.summary.merge_all()

        self.trainingSummaryWriter = tf.summary.FileWriter(
            os.path.join(self.getExperimentDirectory(), 'training-summaries'),
            self.graph)

        if self.shouldRunValidation():
            self.validationSummaryWriter = tf.summary.FileWriter(
                os.path.join(self.getExperimentDirectory(),
                             'validation-summaries'), self.graph)

    def evaluateLoss(self, batchOutputs, labels):
        return tf.identity(tf.losses.sparse_softmax_cross_entropy(
            labels=labels, logits=batchOutputs),
                           name="loss")

    def processInputMiniBatch(self, inputs):
        return self.runEncoderDecoder(inputs, inputs)

    def runEncoderDecoder(self, inputSequence, historicSequence):
        # convert sequences to embeddings (output embeddings are Tensor(batch-size, sequence-length, hidden))
        inputEmbeddings = self.convertToEmbeddings(inputSequence)
        historicEmbeddings = self.convertToEmbeddings(historicSequence)

        # run encoder (encodedEmbeddings is (batch-size, sequence-length, hidden))
        encodedEmbeddings = self.runEncoder(inputEmbeddings)

        # run decoder (logits is Tensor(batch-size, sequence-length, vocab-size)
        logits = self.runDecoder(encodedEmbeddings, historicEmbeddings)

        return logits

    def convertToEmbeddings(self, sequenceIds):
        with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE):
            wordEmbeddingsGlobal = tf.get_variable('word-embeddings', \
                    [self.vocab.getSize(), self.getEmbeddingSize()])
        wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal,
                                                sequenceIds)
        return wordEmbeddings

    def runEncoder(self, embeddings):
        return tf.layers.dense(embeddings,
                               self.getEmbeddingSize(),
                               activation="relu")

    def runDecoder(self, inputEmbeddings, historicEmbeddings):
        return tf.layers.dense(
            tf.concat([inputEmbeddings, historicEmbeddings], axis=2),
            self.vocab.getSize())

    def checkpoint(self):
        """Creates a checkpoint of current model and saves to model
        directory.
        """

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()
        exists = os.path.exists(directory)

        if exists:
            tempDirectory = directory + "-temp"
            shutil.move(directory, tempDirectory)

        with self.graph.as_default():
            tf.saved_model.simple_save(
                self.session,
                directory,
                inputs={"input-tokens": self.inputTokens},
                outputs={"output-probabilities": self.outputProbabilities})

        if exists:
            shutil.rmtree(tempDirectory)

    """Functions to load configuration parameters."""

    def getEmbeddingSize(self):
        return int(self.config["model"]["embeddingSize"])

    def shouldRunValidation(self):
        return self.config["model"]["runValidation"]

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "createNewModel" in self.config["model"]:
            return False
        return bool(self.config["model"]["createNewModel"])

    def getStepsPerEpoch(self):
        return int(self.config["model"]["stepsPerEpoch"])

    def getValidationStepsPerEpoch(self):
        return int(self.config["model"]["validationStepsPerEpoch"])

    def getExperimentDirectory(self):
        return self.config["model"]["directory"]
Beispiel #16
0
class LabelAdaptor:
    def __init__(self, config, source):
        self.config = config
        self.source = source
        self.secondSource = source.clone()
        self.secondSource.shuffleDocuments()
        self.random = numpy.random.RandomState(seed=self.getSeed())
        self.vocab = Vocab(config)

    def next(self):
        chunk = self.source.next()

        isFromSameSource = self.random.binomial(1, 0.5)
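        # with probability 0.5 the second chunk comes from the same in-order
        # source; otherwise it comes from the shuffled copy (a next-sentence-
        # prediction style pairing)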

        if isFromSameSource:
            secondChunk = self.source.next()
        else:
            secondChunk = self.secondSource.next()

        chunk, documentId = zip(*chunk)
        secondChunk, secondDocumentId = zip(*secondChunk)

        labels = self.addTokenLabels(chunk, documentId)
        inputs = self.maskOffTokens(labels)

        secondLabels = self.addTokenLabels(secondChunk, secondDocumentId)
        secondInputs = self.maskOffTokens(secondLabels)

        return inputs, labels, secondInputs, secondLabels

    def addTokenLabels(self, chunk, documentIds):

        return [documentIds[0]] + list(chunk)

    def maskOffTokens(self, labels):
        inputs = list(labels)
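        # BERT-style masking of the loop below: each position after the
        # class-label slot is selected with probability 0.15; a selected token
        # becomes the mask token 80% of the time, a random vocab token 10% of
        # the time, and stays unchanged the remaining 10% of the time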

        for i in range(1, len(labels)):
            if self.random.binomial(1, 0.15):
                if self.random.binomial(1, 0.8):
                    inputs[i] = Vocab.getMaskToken()
                else:
                    if self.random.binomial(1, 0.5):
                        inputs[i] = self.random.randint(Vocab.getVocabOffset(),
                            self.vocab.getSize())

        inputs[0] = Vocab.getClassLabelToken()

        return inputs

    def getSeed(self):
        if not "size" in self.config["adaptor"]["labels"]:
            return 122

        return int(self.config["adaptor"]["labels"]["seed"])

    def reset(self):
        self.random = numpy.random.RandomState(seed=self.getSeed())
        self.source.reset()
        self.secondSource.reset()
        self.secondSource.shuffleDocuments()

    def size(self):
        return self.source.size()

    def setMaximumSize(self, size):
        self.source.setMaximumSize(size)

class FallbackTokenEvaluator:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def initialize(self):
        self.perplexityStates = self.createPerplexityStates(
            self.getBatchSize())

    def evaluate(self, inputs, labels, predictions):
        inputIndices, predictions, vocabProbabilities = self.rewriteSplitTokens(
            inputs, labels, predictions)

        self.recordPredictions(predictions, vocabProbabilities, inputIndices,
                               inputs)

    def getRequestedPredictions(self, inputs, labels):
        return numpy.expand_dims(labels, axis=2)

    def finalize(self):
        return self.getPerplexity()

    def getBatchSize(self):
        if not "adaptor" in self.config:
            return 1

        if not "batching" in self.config["adaptor"]:
            return 1

        if not "size" in self.config["adaptor"]["batching"]:
            return 1

        return int(self.config["adaptor"]["batching"]["size"])

    def createPerplexityStates(self, count):
        return [PerplexityState(self.vocab) for i in range(count)]

    def getPerplexity(self):
        byteCount = sum(
            [state.getByteCount() for state in self.perplexityStates])
        totalEntropy = sum(
            [state.getEntropy() for state in self.perplexityStates])
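        # perplexity normalized per byte: 2 ** (total entropy / total bytes)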

        return 2.0**(totalEntropy / byteCount)

    def recordPredictions(self, predictions, vocabProbabilities, inputIndices,
                          inputs):
        # predictions is Tensor(batch-size, sequence-length, vocab-size)
        # inputs is Tensor(batch-size, sequence-length)
        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # TODO: replace with something like batch gather
        for batch in range(batchSize):
            for element in range(sequenceLength):
                labelPrediction = predictions[batch, element]
                self.perplexityStates[batch].addPrediction(
                    inputs[batch, :], inputIndices[batch, element],
                    labelPrediction, vocabProbabilities[batch, element, :])

    def rewriteSplitTokens(self, inputs, labels, predictions):
        from functools import reduce

        newInputs = []
        newPredictions = []
        newVocabProbabilities = []

        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]

        # collapse expanded tokens
        for batch in range(batchSize):

            inputString = "".join([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ])
            reservedIndices = set([
                index for index, token in enumerate(labels[batch, :])
                if Vocab.isReservedToken(token)
            ])

            tokenizer = UnlimitedVocabTokenizerAdaptor(
                StringDataSource(inputString))

            completeTokens = [
                tokenizer.next() for i in range(tokenizer.size())
            ]

            logger.debug("Reformed input string: '" + str([
                self.vocab.getTokenString(token) for token in labels[batch, :]
                if not Vocab.isReservedToken(token)
            ]))
            logger.debug("' tokenized to: " + str(completeTokens))
            logger.debug(
                " tokens: " +
                str([self.vocab.getToken(token) for token in completeTokens]))

            index = 0
            completeTokenIndex = 0

            newBatchInputs = []
            newBatchPredictions = []
            newBatchVocabProbabilities = []

            while index < sequenceLength:
                token = labels[batch, index]
                completeToken = completeTokens[completeTokenIndex]

                # get token end
                tokenEndIndex = index + 1
                if self.vocab.getToken(
                        completeToken
                ) != token and not index in reservedIndices:
                    while tokenEndIndex < sequenceLength:
                        possibleToken = labels[batch, tokenEndIndex]
                        if (completeTokenIndex + 1) < len(completeTokens):
                            if self.vocab.getToken(
                                    completeTokens[completeTokenIndex +
                                                   1]) == possibleToken:
                                break
                        tokenEndIndex += 1

                # add token
                newBatchInputs.append([index, tokenEndIndex])
                newBatchVocabProbabilities.append(
                    list(predictions[batch, index, :]))
                newBatchVocabProbabilities[-1][0] = 0.0

                # compute new probabilities for the merged token
                predictionValues = predictions[batch, index:tokenEndIndex, 0]
                newBatchPredictions.append(
                    reduce(lambda x, y: x * y, predictionValues))

                if tokenEndIndex > (index + 1):
                    logger.debug("Reformed split tokens: " + str([
                        self.vocab.getTokenString(token)
                        for token in labels[batch, index:tokenEndIndex]
                    ]) + (" with prob: %.4f" % newBatchPredictions[-1]))

                if not index in reservedIndices:
                    completeTokenIndex += 1

                index = tokenEndIndex

            newInputs.append(newBatchInputs)
            newPredictions.append(newBatchPredictions)
            newVocabProbabilities.append(newBatchVocabProbabilities)

        # pad
        maxLength = max([len(tokens) for tokens in newInputs])

        newInputs = [
            inputs +
            [self.getPadToken() for i in range(maxLength - len(inputs))]
            for inputs in newInputs
        ]
        newPredictions = [
            predictions + [0.0 for i in range(maxLength - len(predictions))]
            for predictions in newPredictions
        ]
        newVocabProbabilities = [
            probabilities +
            [[0.0] * predictions.shape[2]
             for i in range(maxLength - len(probabilities))]
            for probabilities in newVocabProbabilities
        ]

        return numpy.array(newInputs), numpy.array(
            newPredictions), numpy.array(newVocabProbabilities)

class UnigramModel:
    def __init__(self, config, trainingData, validationData):
        self.config = config
        self.trainingData = trainingData
        self.validationData = validationData
        self.checkpointer = ModelDescriptionCheckpointer(config, "UnigramModel")

        if not self.trainingData is None:
            self.trainingData.setMaximumSize(int(self.config["model"]["stepsPerEpoch"]))

        if not self.validationData is None:
            self.validationData.setMaximumSize(int(self.config["model"]["validationStepsPerEpoch"]))

        self.getOrLoadModel()

    def train(self):
        for epoch in range(self.getEpochs()):
            self.trainingData.reset()

            self.runOnTrainingDataset(epoch)

            if self.shouldRunValidation():
                self.trainingData.reset()
                self.runOnValidationDataset(epoch)

            self.checkpoint()

    def runOnTrainingDataset(self, epoch):
        import time
        trainStart = time.time()

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.trainingData.next()

            generatorEnd = time.time()

            trainStepStart = time.time()
            self.trainingStep(inputs, labels)
            trainStepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                "), Step (" + str(step) + " / " + str(self.getStepsPerEpoch()) +
                "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) +
                ", training step time: " + ("%.2f" % (trainStepEnd - trainStepStart)))

            print(message, end="\r", flush=True)


        trainEnd = time.time()

        print(message)
        logger.debug(" Training took: " + (str(trainEnd - trainStart)) + " seconds...")

    def trainingStep(self, inputs, labels):
        # just consider the labels
        for batch in range(labels.shape[0]):
            self.totalTokens += labels.shape[1]

            for token in range(labels.shape[1]):
                self.tokenCounts[labels[batch, token]] += 1

    def runOnValidationDataset(self, epoch):
        import time

        start = time.time()

        totalCrossEntropy = 0.0
        totalBytes = 0

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.validationData.next()

            generatorEnd = time.time()

            stepStart = time.time()
            crossEntropy, byteCount = self.validationStep(inputs, labels)
            stepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                "), Step (" + str(step) + " / " + str(self.getValidationStepsPerEpoch()) +
                "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) +
                ", validation step time: " + ("%.2f" % (stepEnd - stepStart)) +
                ", loss is " + str(crossEntropy/tokens))

            print(message, end="\r", flush=True)

            totalCrossEntropy += crossEntropy
            totalBytes += byteCount

        end = time.time()

        print(message)
        logger.debug(" Validation took: " + (str(end - start)) + " seconds... cross entropy is " +
            str(totalCrossEntropy/totalBytes))

    def validationStep(self, inputs, labels):
        import math
        crossEntropy = 0.0
        byteCount = 0
        for batch in range(labels.shape[0]):
            for index in range(labels.shape[1]):
                token = labels[batch, index]
                tokenProbability = self.getTokenProbability(token)
                crossEntropy += -math.log(tokenProbability)
                byteCount += self.vocab.getTokenBytes(token)

        return crossEntropy, byteCount

    def getTokenProbability(self, token):
        count = self.tokenCounts[token]
        # TODO: Implement Kneser-Ney smoothing
        # add-one (Laplace) smoothing so unseen tokens get a nonzero,
        # properly normalized probability
        return (count + 1.0) / (self.totalTokens + self.vocab.getSize())

    def getOrLoadModel(self):
        import os

        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(
            self.checkpointer.getModelDirectory()) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.load()

    def createModel(self):
        self.tokenCounts = numpy.zeros(self.vocab.getSize())
        self.totalTokens = 0

    def checkpoint(self):
        import json
        import os
        import shutil

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()

        exists = os.path.exists(directory)
        if exists:
            tempDirectory = directory + "-temp"

            shutil.move(directory, tempDirectory)

        os.makedirs(directory)
        with open(os.path.join(directory, "unigram-statistics.json"), "w") as jsonFile:
            json.dump([self.totalTokens, [i for i in self.tokenCounts]], jsonFile)

        if exists:
            shutil.rmtree(tempDirectory)

    def predict(self, inputs):
        batchSize = inputs.shape[0]
        length = inputs.shape[1]
        vocabSize = self.getVocab().getSize()

        probs = [self.getTokenProbability(token) for token in range(vocabSize)]

        return numpy.broadcast_to(numpy.array(probs), [batchSize, length, vocabSize])

    def load(self):
        import os
        import json

        self.checkpointer.load()

        directory = self.checkpointer.getModelDirectory()

        logger.debug("Loading checkpoint from: " + str(directory))
        with open(os.path.join(directory, "unigram-statistics.json"), "r") as jsonFile:
            self.totalTokens, counts = json.load(jsonFile)
            self.tokenCounts = numpy.array(counts)

    def getVocab(self):
        return self.vocab

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "createNewModel" in self.config["model"]:
            return False
        return bool(self.config["model"]["createNewModel"])

    def getStepsPerEpoch(self):
        return min(int(self.config["model"]["stepsPerEpoch"]), self.trainingData.size())

    def getValidationStepsPerEpoch(self):
        return min(int(self.config["model"]["validationStepsPerEpoch"]), self.validationData.size())

    def shouldRunValidation(self):
        if not "runValidation" in self.config["model"]:
            return True
        return bool(self.config["model"]["runValidation"])

    def isPredictedToken(self, token):
        return token == Vocab.getMaskToken() or token == Vocab.getVocabOffset()
    def groupDataIntoClusters(self):
        from sklearn.cluster import MiniBatchKMeans
        from sklearn.decomposition import IncrementalPCA

        kmeans = MiniBatchKMeans(n_clusters=self.numberOfClusters)
        featurizer = Featurizer(self.config, self.validationDataset)
        vocab = Vocab(self.config)

        if self.usePCA():
            pca = IncrementalPCA(n_components=32)

        logger.info("Reducing dimensionality...")

        # fit the pca model
        if self.usePCA():
            for iteration in range(self.getIterations()):
                if iteration % 10 == 0:
                    logger.info(" " + str(iteration) + " / " +
                                str(self.getIterations()))
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

                pca.partial_fit(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

            self.validationDataset.reset()

        logger.info("Fitting model...")

        # fit the kmeans model
        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                    reportTime=True)
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()) + " data load time: " +
                            str(dataTime) + " model eval time: " +
                            str(modelTime))
            else:
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

            if self.usePCA():
                embeddings = pca.transform(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

            kmeans.partial_fit(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        self.validationDataset.reset()

        # group into clusters
        # create a histogram of word frequencies per cluster
        clusterHistogram = {i: {} for i in range(self.numberOfClusters)}
        clusterWins = {i: 0 for i in range(self.numberOfClusters)}
        documentMap = {}

        logger.info("Clustering data...")

        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                inputs, labels, embeddings, dataTime, modelTime = featurizer.featurizeOneBatch(
                    reportTime=True)
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()) + " data load time: " +
                            str(dataTime) + " model eval time: " +
                            str(modelTime))
            else:
                inputs, labels, embeddings = featurizer.featurizeOneBatch()

            chunkLength = embeddings.shape[1]
            batchSize = embeddings.shape[0]

            if self.usePCA():
                embeddings = pca.transform(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

            clusters = numpy.reshape(
                kmeans.predict(
                    numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
                (batchSize, chunkLength))

            for batch in range(batchSize):
                documentId = labels[batch, 0]

                if not documentId in documentMap:
                    documentMap[documentId] = []

                clusterIds = []

                for wordIndex in range(1, chunkLength):

                    word = vocab.getTokenString(labels[batch, wordIndex])
                    cluster = clusters[batch, wordIndex]

                    clusterIds.append(cluster)

                    if not labels[batch,
                                  wordIndex] in clusterHistogram[cluster]:
                        clusterHistogram[cluster][labels[batch, wordIndex]] = 0

                    clusterHistogram[cluster][labels[batch, wordIndex]] += 1
                    clusterWins[cluster] += 1

                documentMap[documentId].extend(clusterIds)

        if not os.path.exists(self.outputDirectory):
            os.makedirs(self.outputDirectory)

        # write histograms
        with open(self.getOutputHistogramFileName(), "w") as log:
            for clusterId, clusterCount in sorted(clusterWins.items(),
                                                  key=lambda x: x[1],
                                                  reverse=True):
                words = clusterHistogram[clusterId]
                log.write("Cluster, " + str(clusterId) + " (" +
                          str(clusterCount) + ")\n")
                for wordIndex, count in sorted(words.items(),
                                               key=lambda x: x[1],
                                               reverse=True):
                    log.write("    '" + vocab.getTokenString(wordIndex) +
                              "' " + str(count) + "\n")

        # write document clusters
        for documentId, clusters in documentMap.items():

            histogram = {}

            for cluster in clusters:
                if not cluster in histogram:
                    histogram[cluster] = 0

                histogram[cluster] += 1

            with open(self.getOutputDocumentClusterFileName(documentId),
                      "w") as log:

                for cluster, count in sorted(histogram.items(),
                                             key=lambda x: x[1],
                                             reverse=True):

                    words = clusterHistogram[cluster]
                    topWord = vocab.getTokenString(
                        sorted(words.items(), key=lambda x: x[1],
                               reverse=True)[0][0])
                    log.write("Cluster, " + str(cluster) + ", " + topWord +
                              ", " + str(count) + "\n")

class BERTModel:
    def __init__(self, config, trainingDataSource, validationDataSource):
        """Initializes the linear model object.

        Attributes:
            config: The configuration for the model.
            trainingDataSource: list of training samples and labels
            validationDataSource: list of validation samples and labels

        """
        self.config = config
        self.trainingDataSource = trainingDataSource
        self.validationDataSource = validationDataSource
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        self.checkpointer = ModelDescriptionCheckpointer(config, "BERTModel")
        self.isLoaded = False

    def train(self):
        """Trains the linear model.

        Trains the model for epochs specified in the config.
        Runs the validation dataset on the model if specified in the config.
        """
        with self.graph.as_default():
            self.getOrLoadModel()

        for epoch in range(self.getEpochs()):
            self.runOnTrainingDataset(epoch)

            if self.shouldRunValidation():
                self.runOnValidationDataset(epoch)

            self.checkpoint()

    def predict(self, inputs, requestedPredictions):
        with self.graph.as_default():
            self.getOrLoadModel()

        inputs = numpy.array(inputs)

        predictions = self.session.run(self.outputProbabilities,
                                       feed_dict={self.inputTokens: inputs})

        batchSize = requestedPredictions.shape[0]
        length = requestedPredictions.shape[1]

        outputPredictions = numpy.zeros(requestedPredictions.shape)

        for b in range(batchSize):
            for l in range(length):
                outputPredictions[b,l,:] = \
                    predictions[b,l,requestedPredictions[b,l,:]]
        return outputPredictions

    def getOrLoadModel(self):
        """Returns a linear model.

        If specified, create a new model else load an already existing model.
        """
        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(self.checkpointer.getModelDirectory(
        )) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.loadModel()

    def loadModel(self):
        """Loads an already existing model from the specified path """
        if self.isLoaded:
            return

        self.checkpointer.load()

        directory = self.checkpointer.getModelDirectory()

        logger.debug("Loading checkpoint from: " + str(directory))

        tf.saved_model.loader.load(self.session, ["serve"], directory)

        self.setOperationsByName()

        self.isLoaded = True

    def setOperationsByName(self):
        self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0")
        self.labels = self.graph.get_tensor_by_name("output-labels:0")
        self.outputProbabilities = \
            self.graph.get_tensor_by_name("output-probabilities:0")
        self.loss = self.graph.get_tensor_by_name("loss:0")
        self.optimizerStep = self.graph.get_operation_by_name("optimizer-step")

    def createModel(self):
        # inputTokens and labels are (batch, sequence-length)
        self.inputTokens = tf.placeholder(tf.int32,
                                          shape=(None, None),
                                          name="input-tokens")

        self.labels = tf.placeholder(tf.int32,
                                     shape=(None, None),
                                     name="output-labels")

        predictedLogits = self.processInputMiniBatch(self.inputTokens)
        self.loss = self.evaluateLoss(predictedLogits, self.labels)
        self.outputProbabilities = tf.nn.softmax(predictedLogits,
                                                 name="output-probabilities")

        # optimizer
        self.optimizerStep = self.createOptimizerStep(self.loss)

        # initializers
        self.globalInitializer = tf.global_variables_initializer()
        self.localInitializer = tf.local_variables_initializer()

        # summaries
        self.setupSummaries()

        # do the initialization
        self.initializeModel()

    def initializeModel(self):
        self.session.run(self.globalInitializer)
        self.session.run(self.localInitializer)

    def runOnTrainingDataset(self, epoch):
        """Trains the linear model on the training dataset for one epoch."""
        trainStart = time.time()

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.trainingDataSource.next()

            generatorEnd = time.time()

            trainStepStart = time.time()
            loss, gradNorm = self.trainingStep(inputs, labels, step)
            trainStepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                       "), Step (" + str(step) + " / " +
                       str(self.getStepsPerEpoch()) + "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", training step time: " +
                       ("%.2f" % (trainStepEnd - trainStepStart) + ", loss: " +
                        str("%.2f" % loss) + ", grad norm: " +
                        str("%.2f" % gradNorm)))

            print(message, end="\r", flush=True)

        trainEnd = time.time()

        print(message)
        logger.debug(" Training took: " + (str(trainEnd - trainStart)) +
                     " seconds...")

    def trainingStep(self, inputs, labels, step):
        """Training step for one minibatch of training data."""
        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        trainingLoss, gradNorm, summaries, _ = self.session.run(
            [
                self.loss, self.gradientNorm, self.mergedSummary,
                self.optimizerStep
            ],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })

        self.trainingSummaryWriter.add_summary(summaries, step)
        return trainingLoss, gradNorm

    def runOnValidationDataset(self, epoch):
        """Runs the linear model on the validation dataset for one epoch."""

        validationStart = time.time()

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()

            inputs, labels = self.validationDataSource.next()

            generatorEnd = time.time()

            validationStepStart = time.time()
            loss, summary = self.validationStep(inputs, labels)
            validationStepEnd = time.time()

            message = ("Validation Step (" + str(step) + " / " +
                       str(self.getValidationStepsPerEpoch()) +
                       "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", validation step time: " +
                       ("%.2f" % (validationStepEnd - validationStepStart) +
                        ", loss: " + str(loss)))

            print(message, end="\r", flush=True)

        validationEnd = time.time()

        print(message)
        logger.debug(" Validation took: " +
                     (str(validationEnd - validationStart)) + " seconds...")

    def validationStep(self, inputs, labels):
        """One minibatch of validation data processed by the model."""

        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        validationLoss, summaries = self.session.run(
            [self.loss, self.mergedSummary],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })
        return validationLoss, summaries

    def createOptimizerStep(self, loss):
        """One step of backprop."""

        optimizer = tf.train.AdamOptimizer(learning_rate=float(
            self.config["model"]["learningRate"]),
                                           beta1=0.9,
                                           beta2=0.999,
                                           epsilon=numpy.finfo(float).eps,
                                           name="optimizer-step")

        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(
            gradients, self.config["model"]["gradientClippingFactor"])
        self.gradientNorm = tf.global_norm(gradients, name="gradient-norm")

        return optimizer.apply_gradients(zip(gradients, variables))

    def setupSummaries(self):
        tf.summary.scalar('cross-entropy', self.loss)
        tf.summary.scalar('gradient-norm', self.gradientNorm)

        self.mergedSummary = tf.summary.merge_all()

        self.trainingSummaryWriter = tf.summary.FileWriter(
            os.path.join(self.getExperimentDirectory(), 'training-summaries'),
            self.graph)

        if self.shouldRunValidation():
            self.validationSummaryWriter = tf.summary.FileWriter(
                os.path.join(self.getExperimentDirectory(),
                             'validation-summaries'), self.graph)

    def evaluateLoss(self, batchOutputs, labels):
        return tf.identity(tf.losses.sparse_softmax_cross_entropy(
            labels=labels, logits=batchOutputs),
                           name="loss")

    def processInputMiniBatch(self, inputs):
        return self.runBERT(inputs, inputs)

    def runBERT(self, inputSequence, historicSequence):
        # convert sequences to embeddings (output embeddings are Tensor(batch-size, sequence-length, hidden))
        inputEmbeddings = self.convertToEmbeddings(inputSequence)
        inputEmbeddingsPositionallyEncoded = self.getPositionalEncodings(
            inputEmbeddings)

        # run encoder (encodedEmbeddings is (batch-size, sequence-length, hidden))
        encodedEmbeddings = self.runEncoder(inputEmbeddingsPositionallyEncoded)
        return tf.layers.dense(encodedEmbeddings, units=self.vocab.getSize())

    def convertToEmbeddings(self, sequenceIds):
        with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE):
            wordEmbeddingsGlobal = tf.get_variable('word-embeddings', \
                    [self.vocab.getSize(), self.getEmbeddingSize()])
        wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal,
                                                sequenceIds)
        return wordEmbeddings

    def getPositionalEncodings(self, inputEmbeddings):
        # PE(pos, 2i)   = sin(pos / 10000^(2i / d_model))
        # PE(pos, 2i+1) = cos(pos / 10000^(2i / d_model))
        # where pos is the position and i is the hidden dimension index
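        # e.g. for d_model = 4 and pos = 3 (illustrative values):
        #   PE(3, 0) = sin(3 / 10000^0)   = sin(3.00) ~  0.141
        #   PE(3, 1) = cos(3 / 10000^0)   = cos(3.00) ~ -0.990
        #   PE(3, 2) = sin(3 / 10000^0.5) = sin(0.03) ~  0.030
        #   PE(3, 3) = cos(3 / 10000^0.5) = cos(0.03) ~  1.000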
        sequenceLength = tf.shape(inputEmbeddings)[1]
        hiddenDimension = tf.shape(inputEmbeddings)[2]

        # (sequence-length, 1) positions and (1, hidden) dimension indices
        sequenceRange = tf.reshape(
            tf.range(tf.cast(sequenceLength, tf.float32)),
            (sequenceLength, 1))
        hiddenRange = tf.reshape(
            tf.range(tf.cast(hiddenDimension, tf.float32)),
            (1, hiddenDimension))

        # dimensions 2i and 2i+1 share one frequency
        rawPE = sequenceRange / tf.pow(
            10000.0, 2.0 * tf.floor(hiddenRange / 2.0) /
            tf.cast(hiddenDimension, tf.float32))

        # even hidden dimensions use sin, odd hidden dimensions use cos
        isEvenDimension = tf.cast(
            tf.equal(tf.cast(hiddenRange, tf.int32) % 2, 0), tf.float32)
        PE = (isEvenDimension * tf.sin(rawPE) +
              (1.0 - isEvenDimension) * tf.cos(rawPE))

        # broadcast the (sequence-length, hidden) table over the batch
        return inputEmbeddings + tf.expand_dims(PE, 0)

    def runEncoder(self, embeddings):
        for i in range(self.getLayerCount()):
            right = self.multiHeadedAttention(embeddings)
            left = tf.layers.dense(right, units=embeddings.shape[-1])
            embeddings = self.addAndNorm(left, right)
        return embeddings

    def multiHeadedAttention(self, embeddings):
        # Q,K,V are all -> projected embeddings
        projectedEmbeddings = self.projectEmbeddings(embeddings)
        attentionResults = self.attention(projectedEmbeddings)
        left = self.projectAttentionOutput(attentionResults)
        return self.addAndNorm(left, embeddings)

    def projectEmbeddings(self, embeddings):
        #input -> m, seqL, embedding size
        #output -> m, seqL, 3 * numberOfAttentionHeads * embedding size
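        # the factor of 3 packs the Q, K, and V projections for every
        # attention head into a single dense layer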
        retVal = tf.layers.dense(embeddings,
                                 units=3 * self.getAttentionHeads() *
                                 embeddings.shape[-1])

        return tf.reshape(retVal, [
            tf.shape(retVal)[0],
            tf.shape(retVal)[1], 3,
            self.getAttentionHeads(), embeddings.shape[-1]
        ])

    def attention(self, projectedEmbeddings):
        #m, seqL, (Q, K, V), attentionHeads, embedding size
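        # scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V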
        Q = projectedEmbeddings[:, :, 0, :, :]
        K = projectedEmbeddings[:, :, 1, :, :]
        V = projectedEmbeddings[:, :, 2, :, :]
        d_k = int(projectedEmbeddings.shape[-1])

        m1 = tf.matmul(Q, K, transpose_b=True) / math.sqrt(d_k)
        smx = tf.nn.softmax(m1)
        return tf.matmul(smx, V)

    def projectAttentionOutput(self, attentionResults):
        #attentionResults -> m, seqL, attentionHeads, embedding size
        #new shape is (batch, sequence length, heads * embedding-size)
        batchSize = tf.shape(attentionResults)[0]
        sequenceLength = tf.shape(attentionResults)[1]

        reshapedEmbeddings = tf.reshape(
            attentionResults,
            (batchSize, sequenceLength,
             attentionResults.shape[-1] * attentionResults.shape[-2]))
        return tf.layers.dense(reshapedEmbeddings,
                               units=attentionResults.shape[-1])

    def addAndNorm(self, left, right):
        normalizedLeft = tf.contrib.layers.layer_norm(left)
        return tf.add(normalizedLeft, right)

    def checkpoint(self):
        """Creates a checkpoint of current model and saves to model
        directory.
        """

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()
        exists = os.path.exists(directory)

        if exists:
            tempDirectory = directory + "-temp"
            shutil.move(directory, tempDirectory)

        with self.graph.as_default():
            tf.saved_model.simple_save(
                self.session,
                directory,
                inputs={"input-tokens": self.inputTokens},
                outputs={"output-probabilities": self.outputProbabilities})

        if exists:
            shutil.rmtree(tempDirectory)

    """Functions to load configuration parameters."""

    def getEmbeddingSize(self):
        return int(self.config["model"]["embeddingSize"])

    def shouldRunValidation(self):
        return self.config["model"]["runValidation"]

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "createNewModel" in self.config["model"]:
            return False
        return bool(self.config["model"]["createNewModel"])

    def getStepsPerEpoch(self):
        return int(self.config["model"]["stepsPerEpoch"])

    def getValidationStepsPerEpoch(self):
        return int(self.config["model"]["validationStepsPerEpoch"])

    def getLayerCount(self):
        return int(self.config["model"]["layerCount"])

    def getAttentionHeads(self):
        return int(self.config["model"]["attentionHeads"])

    def getExperimentDirectory(self):
        return self.config["model"]["directory"]