def __init__(self, config, source):
    self.config = config
    self.source = source
    self.secondSource = source.clone()
    self.secondSource.shuffleDocuments()
    self.random = numpy.random.RandomState(seed=self.getSeed())
    self.vocab = Vocab(config)
def getOrLoadModel(self):
    import os
    self.vocab = Vocab(self.config)
    shouldCreate = not os.path.exists(
        self.checkpointer.getModelDirectory()) or self.getShouldCreateModel()
    if shouldCreate:
        self.createModel()
    else:
        self.load()
def getOrLoadModel(self): """Returns a linear model. If specified, create a new model else load an already existing model. """ self.vocab = Vocab(self.config) shouldCreate = not os.path.exists(self.checkpointer.getModelDirectory( )) or self.getShouldCreateModel() if shouldCreate: self.createModel() else: self.loadModel()
def maskOffTokens(self, labels):
    inputs = list(labels)
    for i in range(1, len(labels)):
        if self.random.binomial(1, 0.15):
            if self.random.binomial(1, 0.8):
                inputs[i] = Vocab.getMaskToken()
            else:
                if self.random.binomial(1, 0.5):
                    inputs[i] = self.random.randint(Vocab.getVocabOffset(),
                                                    self.vocab.getSize())
    inputs[0] = Vocab.getClassLabelToken()
    return inputs
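# The masking above follows the standard 15% / 80-10-10 masked-language-model recipe.
# A minimal self-contained sketch of the same rule; MASK_TOKEN, CLASS_TOKEN, VOCAB_OFFSET
# and VOCAB_SIZE are made-up stand-ins for the real Vocab values.
import numpy

MASK_TOKEN = 1
CLASS_TOKEN = 2
VOCAB_OFFSET = 4
VOCAB_SIZE = 100

def mask_off_tokens(labels, rng):
    inputs = list(labels)
    for i in range(1, len(labels)):
        if rng.binomial(1, 0.15):            # select ~15% of positions
            if rng.binomial(1, 0.8):         # 80% of selected: mask token
                inputs[i] = MASK_TOKEN
            elif rng.binomial(1, 0.5):       # half of the rest: random vocab token
                inputs[i] = rng.randint(VOCAB_OFFSET, VOCAB_SIZE)
    inputs[0] = CLASS_TOKEN                  # position 0 carries the class label token
    return inputs

print(mask_off_tokens(list(range(10, 30)), numpy.random.RandomState(0)))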
class PerTokenEvaluator:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def initialize(self):
        self.entropy = 0.0
        self.totalBytes = 0

    def evaluate(self, inputs, labels, predictions):
        import math
        batchSize = predictions.shape[0]
        sequenceLength = predictions.shape[1]
        for batch in range(batchSize):
            for token in range(sequenceLength):
                # probability the model assigned to the correct label at this position
                p = predictions[batch, token, 0]
                tokenBytes = self.vocab.getTokenBytes(labels[batch, token])
                self.entropy += -math.log2(p)
                self.totalBytes += tokenBytes

    def getRequestedPredictions(self, inputs, labels):
        import numpy
        return numpy.expand_dims(labels, axis=2)

    def finalize(self):
        # bits-per-byte perplexity
        return 2 ** (self.entropy / self.totalBytes)
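# A worked example of the byte-normalized perplexity that finalize() reports,
# 2 ** (total bits / total bytes); the (probability, byte-length) pairs are toy values.
import math

toy_predictions = [(0.25, 3), (0.5, 1), (0.125, 4)]   # (p of correct token, token bytes)

total_bits = sum(-math.log2(p) for p, _ in toy_predictions)
total_bytes = sum(b for _, b in toy_predictions)
print(2 ** (total_bits / total_bytes))                # bits-per-byte perplexity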
def saveVocab(dataset, size, directory):
    import os
    import time

    vocab = createInitialVocab()

    if os.path.isdir(directory):
        outputPath = os.path.join(directory, "vocab.txt")
        if not os.path.exists(directory):
            os.makedirs(directory)
    else:
        outputPath = directory

    previousVocabSize = 0
    start = time.time()
    totalTokens = 0

    while True:
        string = dataset.next()
        if len(string) == 0:
            break

        if not string in vocab:
            vocab[string] = 0

        totalTokens += 1
        vocab[string] += 1

        if len(vocab) + Vocab.getVocabOffset() >= previousVocabSize + size * 0.01:
            previousVocabSize = len(vocab) + Vocab.getVocabOffset()
            logger.debug("Vocab size is " + str(previousVocabSize) +
                         " time so far: " + str(time.time() - start) +
                         " total tokens: " + str(totalTokens))

        if len(vocab) + Vocab.getVocabOffset() >= size:
            break

    with open(outputPath, "w", encoding='utf-8') as outputFile:
        for token, count in reversed(sorted(vocab.items(), key=lambda x: x[1])):
            if token[-1] != '\n':
                token += '\n'
            outputFile.write(token)
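# A minimal sketch of the counting loop above, with an in-memory stand-in for the
# dataset source (StubDataSource and the sample text are hypothetical).
from collections import Counter

class StubDataSource:
    """Yields whitespace-separated tokens from a fixed string, then empty strings."""
    def __init__(self, text):
        self.tokens = text.split()
        self.index = 0

    def next(self):
        if self.index >= len(self.tokens):
            return ""
        token = self.tokens[self.index]
        self.index += 1
        return token

source = StubDataSource("the cat sat on the mat the end")
counts = Counter()
while True:
    token = source.next()
    if len(token) == 0:
        break
    counts[token] += 1

# Most frequent tokens first, mirroring the reversed(sorted(...)) write order above.
print([token for token, _ in counts.most_common()])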
def next(self):
    tokenString = self.source.next()
    if self.vocab.contains(tokenString):
        token = self.vocab.getToken(tokenString)
    else:
        token = Vocab.getUnkToken()
    return token
class RandomModel:
    def __init__(self, config):
        self.config = config
        self.vocab = Vocab(config)

    def train(self):
        # no training happens in this model
        pass

    def predict(self, inputs):
        # output is [batch-size, sequence-length, vocab-size] of 1.0/vocab-size
        batchSize = inputs.shape[0]
        sequenceLength = inputs.shape[1]
        vocabSize = self.getVocabSize()
        return numpy.full([batchSize, sequenceLength, vocabSize], 1.0 / vocabSize)

    def getVocabSize(self):
        return self.vocab.getSize()

    def getVocab(self):
        return self.vocab
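# Because every token gets probability 1.0 / vocab-size, this baseline's perplexity
# equals the vocabulary size; a quick check with toy shapes.
import numpy

vocabSize = 7
predictions = numpy.full([2, 5, vocabSize], 1.0 / vocabSize)   # (batch, sequence, vocab)

crossEntropyBits = -numpy.log2(predictions[:, :, 0]).mean()
print(2 ** crossEntropyBits)   # ~7.0, i.e. the vocabulary size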
class ClassTransformerModel: def __init__(self, config, trainingDataSource, validationDataSource): """Initializes the model. Attributes: config: The configuration for the model. trainingDataSource: list of training samples and labels validationDataSource: list of validation samples and labels """ self.config = config self.trainingDataSource = trainingDataSource self.validationDataSource = validationDataSource self.graph = tf.Graph() self.session = tf.Session(graph=self.graph) self.checkpointer = ModelDescriptionCheckpointer(config, self.__class__.__name__) self.isLoaded = False self.bestValidationLoss = None def train(self): """Trains the model. Trains the model for epochs specified in the config. Runs the validation dataset on the model if specified in the config. """ with self.graph.as_default(): self.getOrLoadModel() for epoch in range(self.getEpochs()): self.runOnTrainingDataset(epoch) if self.shouldRunValidation(): self.runOnValidationDataset(epoch) self.validationDataSource.reset() self.checkpointBestModel() self.trainingDataSource.reset() def checkpointBestModel(self): if self.bestValidationLoss is None: self.checkpoint("best") return if self.totalLoss < self.bestValidationLoss: logger.info("Updating best model with loss: " + str(self.totalLoss)) self.bestValidationLoss = self.totalLoss self.checkpoint("best") else: self.checkpoint("checkpoint") def predict(self, inputs, requestedPredictions): with self.graph.as_default(): self.getOrLoadModel() assert False, "Not Implemented" inputs = numpy.array(inputs) predictions = self.session.run(self.outputProbabilities, feed_dict={self.inputTokens : inputs}) batchSize = requestedPredictions.shape[0] length = requestedPredictions.shape[1] outputPredictions = numpy.zeros(requestedPredictions.shape) for b in range(batchSize): for l in range(length): outputPredictions[b,l,:] = \ predictions[b,l,requestedPredictions[b,l,:]] return outputPredictions def getFeatures(self, inputs, secondInputs): with self.graph.as_default(): self.getOrLoadModel() inputs = numpy.expand_dims(numpy.array(inputs), axis=2) secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2) inputs = numpy.concatenate([inputs, secondInputs], axis=2) # (batch, sequence, 2, embedding-size) predictions = self.session.run(self.features, feed_dict={self.inputTokens : inputs}) return predictions[:, :, 0, :] def getOrLoadModel(self): """Returns a linear model. If specified, create a new model else load an already existing model. 
""" if self.isLoaded: return self.vocab = Vocab(self.config) shouldCreate = not os.path.exists( self.checkpointer.getModelLoadDirectory()) or self.getShouldCreateModel() if shouldCreate: self.createModel() else: self.loadModel() self.logModel() def logModel(self): totalParameters = 0 for variable in tf.trainable_variables(): shape = variable.get_shape() variableParameters = 1 for dim in shape: variableParameters *= dim.value totalParameters += variableParameters logger.debug("Variable '" + variable.name + "' " + str(humanize.naturalsize(variableParameters)) + " (params) " + str(shape) + " (dims)") logger.debug("Total #params '" + str(humanize.naturalsize(totalParameters)) + "' ") def loadModel(self): """Loads an already existing model from the specified path """ self.checkpointer.load() directory = self.checkpointer.getModelLoadDirectory() logger.debug("Loading checkpoint from: " + str(directory)) tf.saved_model.loader.load( self.session, ["serve"], directory ) self.setOperationsByName() self.isLoaded = True def setOperationsByName(self): self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0") self.labels = self.graph.get_tensor_by_name("output-labels:0") self.features = self.graph.get_tensor_by_name("features:0") self.vocabLoss = self.graph.get_tensor_by_name("vocab-loss:0") self.classificationLoss = self.graph.get_tensor_by_name("classification-loss:0") self.classLoss = self.graph.get_tensor_by_name("class-loss:0") self.outputProbabilities = self.graph.get_tensor_by_name("output-probabilities:0") self.outputDocumentClass = self.graph.get_tensor_by_name("output-document-class:0") self.loss = self.graph.get_tensor_by_name("loss:0") self.optimizerStep = self.graph.get_operation_by_name("optimizer-step") def createModel(self): # inputs (batch, sequence-length, 2) self.inputTokens = tf.placeholder(tf.int32, shape=(None, None, 2), name="input-tokens") # labels (batch, sequence-length, 2) self.labels = tf.placeholder(tf.int32, shape=(None, None, 2), name="output-labels") self.createClassMappings() # convert to classes (batch, sequence-length, 2, assignments) self.inputClasses = self.convertToClasses(self.inputTokens) self.classLabels = self.convertToClasses(self.labels) # class logits (batch, sequence-length, 2, assignmets, class-size) classLogits = self.runClassModel(self.inputClasses) # classification logits (batch, sequence-length, 2, assignments, 2) classificationLogits = self.runClassificationModel() # document classification logits (batch, sequence-length, 2, assignments, 2) documentClassificationLogits = self.runDocumentClassificationModel() # compute the losses self.clusterLoss = tf.identity(self.evaluateClusteringLoss( self.features, self.classLabels), name="clustering-loss") self.classificationLoss = tf.identity(self.evaluateClassificationLoss( classificationLogits, self.classLabels), name="classification-loss") self.documentClassificationLoss = tf.identity(self.evaluateDocumentClassificationLoss( documentClassificationLogits, self.classLabels), name="document-classification-loss") self.classLoss = tf.identity(self.evaluateLoss(classLogits[:, 1:, :, :, :], self.classLabels[:, 1:, :, :]), name="class-loss") self.vocabLoss = tf.identity(self.evaluateVocabLoss(classLogits[:, 1:, :, :, :], self.labels[:, 1:, :]), name="vocab-loss") self.loss = tf.identity(self.classLoss + self.classificationLoss + self.clusterLoss + self.vocabLoss, name="loss") # convert to vocab logits (batch, sequence-length, vocab-size) vocabLogits = self.expandClassLogitsToVocab(classLogits) 
self.outputProbabilities = tf.nn.softmax(vocabLogits, name="output-probabilities") self.outputDocumentClass = tf.reduce_max(documentClassificationLogits, axis=3) # optimizer self.optimizerStep = self.createOptimizerStep(self.loss, "") self.documentOptimizerStep = self.createOptimizerStep(self.documentClassificationLoss, "document") # initializers self.globalInitializer = tf.global_variables_initializer() self.localInitializer = tf.local_variables_initializer() # summaries self.setupSummaries() # do the initialization self.initializeModel() def createClassMappings(self): mappings = numpy.zeros([self.getAssignmentCount(), self.vocab.getSize()], dtype=numpy.int32) weights = numpy.zeros([self.getAssignmentCount(), self.vocab.getSize()], dtype=numpy.float32) for assignment in range(self.getAssignmentCount()): mappings[assignment, :], weights[assignment, :] = self.createMapping(assignment) self.classMappingsHost = mappings self.classMappings = tf.constant(mappings) self.classWeights = tf.constant(weights) def logAdd(self, left, right): if left is None: return right if left == float("-inf"): return right if right == float("-inf"): return left return max(left, right) + math.log1p(math.exp( -math.fabs(left - right))) def logSumArray(self, array): from functools import reduce return reduce(lambda x, y : self.logAdd(x, y), array) def logSubtract(self, left, right): if left <= right: assert False, "log of negative number in subtraction " + str(left) + " - " + str(right) if right == float("-inf"): return left return left + math.log1p(-math.exp(right - left)) def createMapping(self, assignment): assert self.getNumberOfDirectClasses() <= self.getNumberOfClasses() assert self.getNumberOfDirectClasses() <= self.vocab.getSize() vocabSize = self.vocab.getSize() - self.getNumberOfDirectClasses() numberOfClasses = self.getNumberOfClasses() - self.getNumberOfDirectClasses() directMapping = numpy.arange(self.getNumberOfDirectClasses(), dtype=numpy.int32) directWeights = numpy.ones(self.getNumberOfDirectClasses(), dtype=numpy.float32) mapping, weights = self.createLogMapping(assignment, vocabSize, numberOfClasses) return (numpy.concatenate([directMapping, self.getNumberOfDirectClasses() + mapping]), numpy.concatenate([directWeights, weights])) def createLogMapping(self, assignment, vocabSize, numberOfClasses): generator = numpy.random.RandomState(seed=assignment) wordCounts = reversed([i * self.getWordFrequencyPowerLawExponent() for i in range(vocabSize)]) wordCountsPlusRandom = [i + math.log(generator.uniform(0.0, 1000.0)) for i in wordCounts] logTotalCount = self.logSumArray(wordCountsPlusRandom) sortedWordCounts = sorted(enumerate(wordCountsPlusRandom), key=lambda x: x[1], reverse=True) logClassSize = logTotalCount - math.log(numberOfClasses) mapping = numpy.zeros([vocabSize], dtype=numpy.int32) weights = numpy.zeros([vocabSize], dtype=numpy.float32) currentClass = 0 wordsInCurrentClass = 0 logCurrentCount = None for wordIndex, logWordCount in sortedWordCounts: assert currentClass < numberOfClasses mapping[wordIndex] = currentClass wordsInCurrentClass += 1 logCurrentCount = self.logAdd(logCurrentCount, logWordCount) if logCurrentCount >= logClassSize and currentClass + 1 != numberOfClasses: #print(logCurrentCount, logWordCount, currentClass, logClassSize) logCurrentCount = self.logSubtract(logCurrentCount, logClassSize) wordsInCurrentClass = 0 currentClass += 1 currentClass = 0 currentClassSize = 0 currentClassMembers = [] for i, wordCountAndIndex in enumerate(sortedWordCounts): wordIndex, wordCount = 
wordCountAndIndex currentClassMembers.append(wordIndex) currentClassSize += 1 # if end of current class if ((1 + i) == len(sortedWordCounts) or mapping[sortedWordCounts[1 + i][0]] != currentClass): for memberIndex in currentClassMembers: weights[memberIndex] = 1.0 / currentClassSize if currentClass == 0 or i == (len(sortedWordCounts) - 1): logger.info("current class " + str(currentClass) + " members " + str(len(currentClassMembers))) currentClass += 1 currentClassSize = 0 currentClassMembers = [] return mapping, weights def initializeModel(self): self.session.run(self.globalInitializer) self.session.run(self.localInitializer) def runOnTrainingDataset(self, epoch): """Trains the linear model on the training dataset for one epoch.""" trainStart = time.time() totalLoss = 0.0 message = None for step in range(self.getStepsPerEpoch()): generatorStart = time.time() try: inputs, labels, secondInputs, secondLabels = self.trainingDataSource.next() except Exception as e: if message is None: message = str(e) break generatorEnd = time.time() trainStepStart = time.time() loss, gradNorm = self.trainingStep(inputs, labels, secondInputs, secondLabels, step, epoch) trainStepEnd = time.time() totalLoss += loss message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) + "), Step (" + str(step) + " / " + str(self.getStepsPerEpoch()) + "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) + ", training step time: " + ("%.2f" % (trainStepEnd - trainStepStart) + ", loss: " + str("%.2f" % loss) + ", grad norm: " + str("%.2f" % gradNorm)) + ", avg-loss: " + str("%.2f" % (totalLoss / (step + 1)))) print(message, end="\r", flush=True) trainEnd = time.time() print(message) logger.debug(" Training took: " + (str(trainEnd - trainStart)) + " seconds...") def trainingStep(self, inputs, labels, secondInputs, secondLabels, step, epoch): """Training step for one minibatch of training data.""" inputs = numpy.expand_dims(numpy.array(inputs), axis=2) labels = numpy.expand_dims(numpy.array(labels), axis=2) secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2) secondLabels = numpy.expand_dims(numpy.array(secondLabels), axis=2) inputs = numpy.concatenate([inputs, secondInputs], axis=2) labels = numpy.concatenate([labels, secondLabels], axis=2) if self.getShouldClassifyDocument(): optimizerStep = self.documentOptimizerStep loss = self.documentClassificationLoss else: optimizerStep = self.optimizerStep loss = self.loss trainingLoss, gradNorm, summaries, _ = self.session.run([loss, self.gradientNorm, self.mergedSummary, optimizerStep], feed_dict={self.inputTokens : inputs, self.labels : labels }) if step % self.getStepsPerTensorboardLog(): self.trainingSummaryWriter.add_summary(summaries, step + epoch * self.getStepsPerEpoch()) return trainingLoss, gradNorm def runOnValidationDataset(self, epoch): """Runs the linear model on the validation dataset for one epoch.""" validationStart = time.time() self.totalLoss = 0.0 self.totalVocabLoss = 0.0 message = None for step in range(self.getValidationStepsPerEpoch()): generatorStart = time.time() try: inputs, labels, secondInputs, secondLabels = self.validationDataSource.next() except Exception as e: if message is None: message = str(e) break generatorEnd = time.time() validationStepStart = time.time() loss, vocabLoss = self.validationStep(inputs, labels, secondInputs, secondLabels) validationStepEnd = time.time() self.totalLoss += loss self.totalVocabLoss += vocabLoss message = ("Validation Step (" + str(step) + " / " + str(self.getValidationStepsPerEpoch()) 
+ "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) + ", validation step time: " + ("%.2f" % (validationStepEnd - validationStepStart)) + ", avg-loss: " + ("%.2f" % (self.totalLoss/(step + 1)))) print(message, end="\r", flush=True) validationEnd = time.time() print(message) logger.debug(" Validation took: " + (str(validationEnd - validationStart)) + " seconds...") self.addValidationSummaries(self.totalLoss, self.totalVocabLoss, epoch) def addValidationSummaries(self, totalLoss, vocabLoss, epoch): averageLoss = totalLoss / self.getValidationStepsPerEpoch() summary = tf.Summary(value=[ tf.Summary.Value(tag="validation-loss", simple_value=averageLoss), ]) self.trainingSummaryWriter.add_summary(summary, epoch) averageVocabLoss = vocabLoss / self.getValidationStepsPerEpoch() summary = tf.Summary(value=[ tf.Summary.Value(tag="validation-vocab-cross-entropy", simple_value=averageVocabLoss), ]) self.trainingSummaryWriter.add_summary(summary, epoch) def validationStep(self, inputs, labels, secondInputs, secondLabels): """One minibatch of validation data processed by the model.""" inputs = numpy.expand_dims(numpy.array(inputs), axis=2) labels = numpy.expand_dims(numpy.array(labels), axis=2) secondInputs = numpy.expand_dims(numpy.array(secondInputs), axis=2) secondLabels = numpy.expand_dims(numpy.array(secondLabels), axis=2) inputs = numpy.concatenate([inputs, secondInputs], axis=2) labels = numpy.concatenate([labels, secondLabels], axis=2) if self.getShouldClassifyDocument(): loss = self.documentClassificationLoss else: loss = self.loss validationLoss, vocabLoss = self.session.run([loss, self.vocabLoss], feed_dict={self.inputTokens : inputs, self.labels : labels}) return validationLoss, vocabLoss def createOptimizerStep(self, loss, name): """One step of backprop.""" optimizer = tf.train.AdamOptimizer( learning_rate=float(self.config["model"]["learning-rate"]), beta1=0.9, beta2=0.98, epsilon=10e-9, name=name+"optimizer-step") gradients, variables = zip(*optimizer.compute_gradients(loss)) gradients, _ = tf.clip_by_global_norm(gradients, self.config["model"]["gradient-clipping-factor"]) self.gradientNorm = tf.global_norm(gradients, name="gradient-norm") return optimizer.apply_gradients(zip(gradients, variables)) def setupSummaries(self): tf.summary.scalar('total-loss', self.loss) if self.getShouldClassifyDocument(): tf.summary.scalar('document-class-cross-entropy', self.documentClassificationLoss) else: tf.summary.scalar('document-match-cross-entropy', self.classificationLoss) tf.summary.scalar('vocab-cross-entropy', self.vocabLoss) tf.summary.scalar('class-cross-entropy', self.classLoss) tf.summary.scalar('cluster-loss', self.clusterLoss) tf.summary.scalar('gradient-norm', self.gradientNorm) self.mergedSummary = tf.summary.merge_all() self.trainingSummaryWriter = tf.summary.FileWriter( os.path.join(self.getExperimentDirectory(), 'training-summaries'), self.graph) #if self.shouldRunValidation(): # self.validationSummaryWriter = tf.summary.FileWriter( # os.path.join(self.getExperimentDirectory(), 'validation-summaries'), # self.graph) def evaluateClusteringLoss(self, features, classLabels): # features is [batch, sequence, 2, assignments, feature-dimension] # class labels is [batch, sequence, 2, assignments] assignmentLosses = [] batchSize = tf.shape(features)[0] sequenceLength = tf.shape(features)[1] features = tf.reshape(self.features, (batchSize, sequenceLength, 2, self.getAssignmentCount(), self.getEmbeddingSize())) for i in range(self.getAssignmentCount()): 
assignmentLosses.append(self.evaluatePerAssignmentClusterLoss( features[:, :, :, i, :], classLabels[:, :, :, i])) return sum(assignmentLosses) / (tf.multiply(tf.cast(batchSize, dtype=tf.float32), 2.0 * self.getAssignmentCount())) def evaluatePerAssignmentClusterLoss(self, features, labels): # features is [batch, sequence, 2, feature-dim] # labels is [batch, sequence, 2] wordFeatures = tf.reshape(features[:, 0, :, :], (-1, self.getEmbeddingSize())) tripletLabels = tf.reshape(labels[:, 0, :], (-1, )) return self.tripletLoss(wordFeatures, tripletLabels) def tripletLoss(self, features, labels): return tf.contrib.losses.metric_learning.triplet_semihard_loss(labels, features) def evaluateClassificationLoss(self, batchOutputs, labels): # batch outputs is [batch, assignments, 2] # labels is [batch, sequence, 2, assignments, 1] labels = tf.cast(tf.equal(labels[:, 0, 0, :, 0], labels[:, 0, 1, :, 0]), tf.int32) return tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=batchOutputs) def evaluateDocumentClassificationLoss(self, batchOutputs, labels): # batch outputs is [batch, 2, assignments, 2] # labels is [batch, sequence, 2, assignments, 1] labels = labels[:,0,:,:,0] return tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=batchOutputs) def evaluateLoss(self, batchOutputs, labels): return tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=batchOutputs) def klDivergence(self, a, b): a = tf.distributions.Categorical(probs=a + numpy.finfo(float).eps) b = tf.distributions.Categorical(probs=tf.nn.softmax(b) + numpy.finfo(float).eps) return tf.reduce_mean(tf.distributions.kl_divergence(a, b, allow_nan_stats=False)) def convertToClasses(self, inputs): # inputs is (batch, sequence, 2) # class mappings is (assignments, vocab size) # outputs is (batch, sequence, 2, assignments) batchSize = tf.shape(inputs)[0] sequenceLength = tf.shape(inputs)[1] classes = tf.concat([tf.reshape(tf.gather(self.classMappings[i, :], inputs), (batchSize, sequenceLength, 2, 1)) for i in range(self.getAssignmentCount())], axis=3) return tf.reshape(classes, (batchSize, sequenceLength, 2, self.getAssignmentCount(), 1)) def expandClassLogitsToVocab(self, classLogits): # class logits is (batch size, sequence-length, 2, assignments, class-size) # class mappings is (class-assignments, vocab-size) # class weights is (class-assignments, vocab-size) # output is (batch-size, sequence-length, 2, vocab-size) batchSize = tf.shape(classLogits)[0] sequenceLength = tf.shape(classLogits)[1] gatheredLogits = tf.concat([tf.reshape(tf.gather(classLogits[:,:,:,i,:], self.classMappings[i, :], axis=3), (batchSize, sequenceLength, 2, 1, self.vocab.getSize())) for i in range(self.getAssignmentCount())], axis=3) return tf.reduce_mean(tf.multiply(gatheredLogits, self.classWeights), axis=3) def evaluateVocabLoss(self, classLogits, vocabLabels): # labels is (batch size, sequence-length, 2) batchSize = tf.shape(classLogits)[0] sequenceLength = tf.shape(classLogits)[1] sampleCount = self.getSoftmaxSampleCount() samples = self.generateSamples(sampleCount) sampledLabels = tf.zeros((batchSize, sequenceLength, 2), dtype=tf.int32) # sampled mappings is (assignment count, sample count) sampledMappings = self.sample(self.classMappings, samples, sampleCount) # sampled weights is (assignment count, sample count) sampledWeights = self.sample(self.classWeights, samples, sampleCount) # gathered logits is (batch size, sequence length, assignment count, sample count) gatheredLogits = tf.concat([tf.reshape(tf.gather(classLogits[:,:,:,i,:], 
sampledMappings[i,:], axis=3), (batchSize, sequenceLength, 2, 1, sampleCount)) for i in range(self.getAssignmentCount())], axis=3) # gathered weights is (batch size, sequence length, 2, assignment count, sample count) gatheredWeights = self.broadcastToExpandedDimension(sampledWeights, batchSize, sequenceLength) # gathered logits and weights is (batch size, sequence length, 2, assignment count, sample count + 1) gatheredLogits = self.extendLogits(gatheredLogits, classLogits, vocabLabels) gatheredWeights = self.extendWeights(gatheredWeights, vocabLabels) # weighted logits is (batch size, sequence length, 2, assignments, sample count + 1) weightedLogits = tf.multiply(gatheredLogits, gatheredWeights) # vocab logits is (batch size, sequence length, 2, sample count + 1) vocabLogits = tf.reduce_mean(weightedLogits, axis=3) return self.evaluateLoss(vocabLogits[:, 1:, :, :], sampledLabels[:, 1:, :]) def generateSamples(self, sampleCount): samplesPerAssignment = [] # TODO: BUG: Dont sample the label for assignment in range(self.getAssignmentCount()): samples, _, _ = tf.random.uniform_candidate_sampler( true_classes=tf.broadcast_to(tf.range(self.vocab.getSize(), dtype=tf.int64), (1, self.vocab.getSize())), num_true=self.vocab.getSize(), num_sampled=sampleCount, range_max=self.vocab.getSize(), unique=True) samplesPerAssignment.append(tf.reshape(samples, (1, -1))) return tf.concat(samplesPerAssignment, axis=0) def extendLogits(self, vocabLogits, classLogits, labels): # class logits is (batch size, sequence length, 2, assignment count, sample count) # map is (assignment count, vocab size) # labels is (batch size, sequence length, 2) batchSize = tf.shape(classLogits)[0] sequenceLength = tf.shape(classLogits)[1] # labelClasses is (batch size, sequence length, 2, assignment count, 1) labelClasses = tf.concat( [tf.reshape(tf.gather(self.classMappings[i, :], labels), (batchSize, sequenceLength, 2, 1, 1)) for i in range(self.getAssignmentCount())], axis=3) # gathered logits is (batch size, sequence length, 2, assignment count, 1) gatheredLogits = tf.batch_gather(classLogits, labelClasses) return tf.concat([gatheredLogits, vocabLogits], axis=4) def extendWeights(self, vocabWeights, labels): # vocab weights is (batch size, sequence length, 2, assignment count, sample count) # labels is (batch size, sequence length) batchSize = tf.shape(vocabWeights)[0] sequenceLength = tf.shape(vocabWeights)[1] # labelWeights is (batch size, sequence length, 2, assignment count, 1) labelWeights = tf.concat( [tf.reshape(tf.gather(self.classWeights[i, :], labels), (batchSize, sequenceLength, 2, 1, 1)) for i in range(self.getAssignmentCount())], axis=3) return tf.concat([labelWeights, vocabWeights], axis=4) def sample(self, mappings, samples, sampleCount): assignments = [] for i in range(self.getAssignmentCount()): assignments.append(tf.reshape(tf.gather(mappings[i, :], samples[i,:]), (1, sampleCount))) return tf.concat(assignments, axis=0) def broadcastToExpandedDimension(self, tensor, batchSize, sequenceLength): classAssignments = tensor.shape[0] vocabSize = tensor.shape[1] newShape = (batchSize, sequenceLength, 2, classAssignments, vocabSize) expandedTensor = tf.broadcast_to(tensor, newShape) #print(expandedTensor.shape) reshapedTensor = tf.reshape(expandedTensor, newShape) #print(reshapedTensor.shape) return reshapedTensor def runClassModel(self, inputs): #print("inputs", inputs.shape) inputEmbeddings = self.convertToEmbeddings(inputs) #print("inputEmbeddings", inputEmbeddings.shape) # run encoder (logits is (batch-size, 
sequence-length, assignments, class-count)) encodedEmbeddings = self.runEncoder(inputEmbeddings) logits = self.runDecoder(encodedEmbeddings) #print("logits", logits.shape) return logits def runClassificationModel(self): batchSize = tf.shape(self.features)[0] sequenceLength = tf.shape(self.features)[1] features = tf.reshape(self.features, (batchSize, sequenceLength, 2, self.getAssignmentCount(), self.getEmbeddingSize())) features = self.multiheadedAttention(features) # features is (batch-size, sequence-length, 2, assignments, embedding-size) reducedFeatures = tf.reduce_max(features, axis=1) # reducedFeatures is (batch size, 2, assignments, embedding-size) transposedFeatures = tf.transpose(reducedFeatures, [0,2,1,3]) # transposedFeatures is (batch size, assignments, 2, embedding-size) reshapedFeatures = tf.reshape(transposedFeatures, (-1, self.getAssignmentCount(), 2 * self.getEmbeddingSize())) return tf.layers.dense(reshapedFeatures, units=2) def runDocumentClassificationModel(self): batchSize = tf.shape(self.features)[0] sequenceLength = tf.shape(self.features)[1] features = tf.reshape(self.features, (batchSize, sequenceLength, 2, self.getAssignmentCount(), self.getEmbeddingSize())) features = self.multiheadedAttention(features) # features is (batch-size, sequence-length, 2, assignments, embedding-size) reducedFeatures = tf.reduce_max(features, axis=1) # transposedFeatures is (batch size, assignments, 2, embedding-size) reshapedFeatures = tf.reshape(reducedFeatures, (-1, 2, self.getAssignmentCount(), self.getEmbeddingSize())) return tf.layers.dense(reshapedFeatures, units=2) def convertToEmbeddings(self, sequenceIds): assignments = [] for assignment in range(self.getAssignmentCount()): assignments.append(self.convertToClassEmbeddings(sequenceIds, assignment)) return tf.concat(assignments, axis = 3) def convertToClassEmbeddings(self, ids, assignment): with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE): wordEmbeddingsGlobal = tf.get_variable('class-embeddings-' + str(assignment), \ [self.getNumberOfClasses(), self.getEmbeddingSize()]) wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal, ids[:, :, :, assignment, :]) return wordEmbeddings def runEncoder(self, embeddings): return self.multiheadedAttentionStack(embeddings) def runDecoder(self, embeddings): batchSize = tf.shape(embeddings)[0] sequenceLength = tf.shape(embeddings)[1] # embeddings is (batch size, sequence length, 2, assignments, classes) return tf.concat([tf.reshape(tf.layers.dense(embeddings[:,:,:,i,:], units=self.getNumberOfClasses()), (batchSize, sequenceLength, 2, 1, self.getNumberOfClasses())) for i in range(self.getAssignmentCount())], axis=3) def multiheadedAttentionStack(self, embeddings): embeddings = self.addPositions(embeddings) # embeddings (batch-size, sequence-length, 2, assignments, hidden-dimension) for layer in range(self.getNumberOfLayers()): embeddings = self.multiheadedAttention(embeddings) if self.isMiddleLayer(layer): batchSize = tf.shape(embeddings)[0] sequenceLength = tf.shape(embeddings)[1] self.features = tf.identity(tf.reshape(embeddings, (batchSize, sequenceLength, 2, self.getAssignmentCount() * self.getEmbeddingSize())), name="features") return embeddings def addPositions(self, embeddings): batchSize = tf.shape(embeddings)[0] sequenceLength = tf.shape(embeddings)[1] halfSequenceLength = (sequenceLength + 1) // 2 positions = tf.cast(tf.reshape(tf.range(halfSequenceLength), (1, halfSequenceLength, 1, 1, 1)), dtype=tf.float32) dimensions = 
tf.cast(tf.reshape(tf.range(self.getEmbeddingSize()), (1, 1, 1, 1, self.getEmbeddingSize())), dtype=tf.float32) angles = positions / tf.pow(2.0 * tf.cast(halfSequenceLength, dtype=tf.float32), 2.0 * dimensions / self.getEmbeddingSize()) evenPositionEmbeddings = tf.reshape(tf.sin(angles), (1, halfSequenceLength, 1, 1, 1, self.getEmbeddingSize())) oddPositionEmbeddings = tf.reshape(tf.cos(angles), (1, halfSequenceLength, 1, 1, 1, self.getEmbeddingSize())) # merge them positionEmbeddings = tf.concat([evenPositionEmbeddings, oddPositionEmbeddings], axis=2) positionEmbeddings = tf.reshape(positionEmbeddings, (1, 2 * halfSequenceLength, 1, 1, self.getEmbeddingSize())) positionEmbeddings = positionEmbeddings[:, 0:sequenceLength, :, :, :] return embeddings + positionEmbeddings def isMiddleLayer(self, layer): if self.getNumberOfLayers() > 1: return layer == (self.getNumberOfLayers() - 2) return layer == (self.getNumberOfLayers() - 1) def multiheadedAttention(self, embeddings): # embeddings (batch-size, sequence-length, assignments, hidden-dimension) projectedEmbeddings = self.projectEmbeddings(embeddings) # proj-embeddings (batch-size, sequence-length, assignments, QKV, attention-heads, hidden-dimension) attentionOutput = self.runAttention(projectedEmbeddings) # project back outputEmbeddings = self.projectBackEmbeddings(attentionOutput) # add and norm embeddings = self.addAndNorm(outputEmbeddings, embeddings) # dense layer denseOutput = tf.layers.dense(embeddings, self.getEmbeddingSize(), activation="relu") # add and norm denseOutput = self.addAndNorm(denseOutput, embeddings) return denseOutput def projectEmbeddings(self, embeddings): output = tf.layers.dense(embeddings, embeddings.shape[-1] * 3 * self.getNumberOfAttentionHeads()) batchSize = tf.shape(embeddings)[0] sequenceLength = tf.shape(embeddings)[1] assignments = embeddings.shape[3] return tf.reshape(output, (batchSize, sequenceLength, 2, assignments, 3, self.getNumberOfAttentionHeads(), embeddings.shape[-1])) def projectBackEmbeddings(self, embeddings): # embeddings are (batch-size, sequence-length, 2, assignments, attention-heads, embedding-size) # project to (batch-size, sequece-length, 2, assignments, embedding-size) batchSize = tf.shape(embeddings)[0] sequenceLength = tf.shape(embeddings)[1] assignments = embeddings.shape[3] reshapedEmbeddings = tf.reshape(embeddings, (batchSize, sequenceLength, 2, assignments, embeddings.shape[-1] * embeddings.shape[-2])) projectedEmbeddings = tf.layers.dense(reshapedEmbeddings, self.getEmbeddingSize()) return projectedEmbeddings def addAndNorm(self, left, right): return tf.contrib.layers.layer_norm(tf.add(left, right)) def runAttention(self, embeddings): # Q,K,V (batch-size, sequence-length, 2, assignments, attention-heads, hidden-dimension) Q = embeddings[:,:,:,:,0,:,:] K = embeddings[:,:,:,:,1,:,:] V = embeddings[:,:,:,:,2,:,:] readOn = tf.matmul(Q, K, transpose_b=True) scale = math.sqrt(self.getEmbeddingSize()) scaledReadOn = readOn / scale contribution = tf.nn.softmax(scaledReadOn, axis=1) result = tf.matmul(contribution, V) return result def checkpoint(self, prefix): """Creates a checkpoint of the current model and saves to model directory. 
""" self.checkpointer.setPrefix(prefix) directory = self.checkpointer.getModelSaveDirectory() logger.debug("Saving checkpoint to: " + str(directory)) self.checkpointer.checkpoint() with self.graph.as_default(): tf.saved_model.simple_save(self.session, directory, inputs={"input_text" : self.inputTokens}, outputs={"outputs" : self.outputDocumentClass}) self.checkpointer.cleanup() """Functions to load configuration parameters.""" def getEmbeddingSize(self): return int(self.config["model"]["embedding-size"]) def getAssignmentCount(self): return int(self.config["model"]["assignment-count"]) def getSoftmaxSampleCount(self): return int(self.config["model"]["softmax-sample-count"]) def getNumberOfClasses(self): return int(self.config["model"]["number-of-classes"]) def getNumberOfDirectClasses(self): return int(self.config["model"]["number-of-direct-classes"]) def getNumberOfLayers(self): return int(self.config["model"]["number-of-layers"]) def getNumberOfAttentionHeads(self): return int(self.config["model"]["number-of-attention-heads"]) def getWordFrequencyPowerLawExponent(self): return float(self.config["model"]["word-frequency-power-law-exponent"]) def shouldRunValidation(self): return self.config["model"]["run-validation"] def getEpochs(self): return int(self.config["model"]["epochs"]) def getShouldCreateModel(self): if not "create-new-model" in self.config["model"]: return False return bool(self.config["model"]["create-new-model"]) def getShouldClassifyDocument(self): if not "classify-document" in self.config["model"]: return False return bool(self.config["model"]["classify-document"]) def getStepsPerEpoch(self): return int(self.config["model"]["steps-per-epoch"]) def getStepsPerTensorboardLog(self): return int(self.config["model"]["steps-per-tensorboard-log"]) def getValidationStepsPerEpoch(self): return int(self.config["model"]["validation-steps-per-epoch"]) def getExperimentDirectory(self): return self.config["model"]["directory"]
import sys
sys.path.append('source')

from models.Vocab import Vocab
from sklearn.cluster import MiniBatchKMeans

import os
import numpy

directory = 'output-features-16k-classes-2-layers-200MB-3'
numberOfClusters = 16

vocab = Vocab({"model": {"vocab": os.path.join(directory, 'vocab.txt')}})

embeddings = numpy.load(os.path.join(directory, 'features.npy'))
inputs = numpy.load(os.path.join(directory, 'inputs.npy'))
labels = numpy.load(os.path.join(directory, 'labels.npy'))

chunkCount = embeddings.shape[0]
chunkLength = embeddings.shape[1]

clusters = numpy.reshape(
    MiniBatchKMeans(n_clusters=numberOfClusters).fit_predict(
        numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
    (chunkCount, chunkLength))

clusterMap = {i: [] for i in range(numberOfClusters)}

for chunk in range(chunkCount):
    chunkString = [
        vocab.getTokenString(labels[chunk, word])
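# The clustering step assigns one cluster id per (chunk, position) by flattening the
# token axis before fit_predict and reshaping afterwards; a toy-sized sketch of the
# same reshape pattern (random embeddings stand in for the saved features).
import numpy
from sklearn.cluster import MiniBatchKMeans

toyEmbeddings = numpy.random.RandomState(0).randn(4, 6, 8)   # (chunks, length, features)
toyClusters = MiniBatchKMeans(n_clusters=3).fit_predict(
    toyEmbeddings.reshape(-1, 8)).reshape(4, 6)
print(toyClusters.shape)   # (4, 6)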
def __init__(self, config):
    self.config = config
    self.vocab = Vocab(config)
def rewriteSplitTokens(self, inputs, labels, predictions): from functools import reduce newInputs = [] newPredictions = [] newVocabProbabilities = [] batchSize = predictions.shape[0] sequenceLength = predictions.shape[1] # collapse expanded tokens for batch in range(batchSize): inputString = "".join([ self.vocab.getTokenString(token) for token in labels[batch, :] if not Vocab.isReservedToken(token) ]) reservedIndices = set([ index for index, token in enumerate(labels[batch, :]) if Vocab.isReservedToken(token) ]) tokenizer = UnlimitedVocabTokenizerAdaptor( StringDataSource(inputString)) completeTokens = [ tokenizer.next() for i in range(tokenizer.size()) ] logger.debug("Reformed input string: '" + str([ self.vocab.getTokenString(token) for token in labels[batch, :] if not Vocab.isReservedToken(token) ])) logger.debug("' tokenized to: " + str(completeTokens)) logger.debug( " tokens: " + str([self.vocab.getToken(token) for token in completeTokens])) index = 0 completeTokenIndex = 0 newBatchInputs = [] newBatchPredictions = [] newBatchVocabProbabilities = [] while index < sequenceLength: token = labels[batch, index] completeToken = completeTokens[completeTokenIndex] # get token end tokenEndIndex = index + 1 if self.vocab.getToken( completeToken ) != token and not index in reservedIndices: while tokenEndIndex < sequenceLength: possibleToken = labels[batch, tokenEndIndex] if (completeTokenIndex + 1) < len(completeTokens): if self.vocab.getToken( completeTokens[completeTokenIndex + 1]) == possibleToken: break tokenEndIndex += 1 # add token newBatchInputs.append([index, tokenEndIndex]) newBatchVocabProbabilities.append( list(predictions[batch, index, :])) newBatchVocabProbabilities[-1][0] = 0.0 # compute new probabilities for the merged token predictionValues = predictions[batch, index:tokenEndIndex, 0] newBatchPredictions.append( reduce(lambda x, y: x * y, predictionValues)) if tokenEndIndex > (index + 1): logger.debug("Reformed split tokens: " + str([ self.vocab.getTokenString(token) for token in labels[batch, index:tokenEndIndex] ]) + (" with prob: %.4f" % newBatchPredictions[-1])) if not index in reservedIndices: completeTokenIndex += 1 index = tokenEndIndex newInputs.append(newBatchInputs) newPredictions.append(newBatchPredictions) newVocabProbabilities.append(newBatchVocabProbabilities) # pad maxLength = max([len(tokens) for tokens in newInputs]) newInputs = [ inputs + [self.getPadToken() for i in range(maxLength - len(inputs))] for inputs in newInputs ] newPredictions = [ predictions + [0.0 for i in range(maxLength - len(predictions))] for predictions in newPredictions ] newVocabProbabilties = [ predictions + [0.0 for i in range(maxLength - len(predictions))] for predictions in newVocabProbabilities ] return numpy.array(newInputs), numpy.array( newPredictions), numpy.array(newVocabProbabilities)
def runLocally(arguments):
    import numpy
    numpy.set_printoptions(precision=3, linewidth=150)

    device = getDevice()

    with tf.device(device):
        for scope in arguments["enable_logger"]:
            logger = logging.getLogger(scope)
            logger.setLevel(logging.DEBUG)

        config = loadConfig(arguments)
        overrideConfig(config, arguments)

        if arguments["predict"]:
            if not "predictor" in config:
                config["predictor"] = {}
            validationData = getValidationData(config)
            predictor = getPredictor(config, validationData)
            perplexity = predictor.predict()
            print("Perplexity " + str(perplexity))
        elif arguments["make_clusters"]:
            validationData = getValidationData(config)
            clusterer = getClusterer(config, validationData,
                                     arguments["output_directory"],
                                     int(arguments["cluster_count"]))
            clusterer.groupDataIntoClusters()
        elif arguments["make_test_set"]:
            validationData = getValidationData(config)
            if int(arguments["test_set_size"]) > 0:
                saveData(validationData, int(arguments["test_set_size"]),
                         arguments["output_directory"], Vocab(config))
            else:
                assert int(arguments["test_set_size_bytes"]) > 0
                saveDataBytes(validationData, int(arguments["test_set_size_bytes"]),
                              arguments["output_directory"], Vocab(config))
        elif arguments["make_vocab"]:
            validationData = getValidationData(config)
            saveVocab(validationData, int(arguments["vocab_size"]),
                      arguments["output_directory"])
        else:
            config["model"]["directory"] = nameDirectory(arguments["experiment_name"])
            makeExperiment(config)
            trainingData = getTrainingData(config)
            validationData = getValidationData(config)
            model = getModel(config, trainingData, validationData)
            model.train()
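# Everything above is driven by a nested config dictionary. A hypothetical minimal
# config showing the kinds of keys the transformer training path reads (key names match
# the getters defined earlier; the values here are illustrative placeholders only).
exampleConfig = {
    "model": {
        "directory": "experiments/example-run",
        "epochs": 1,
        "steps-per-epoch": 100,
        "validation-steps-per-epoch": 10,
        "steps-per-tensorboard-log": 10,
        "run-validation": True,
        "learning-rate": 1e-4,
        "gradient-clipping-factor": 1.0,
        "embedding-size": 128,
        "assignment-count": 2,
        "number-of-classes": 512,
        "number-of-direct-classes": 16,
        "number-of-layers": 2,
        "number-of-attention-heads": 4,
        "softmax-sample-count": 64,
        "word-frequency-power-law-exponent": 1.0,
    }
}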
def loadVocab(self):
    return Vocab(self.config)
class LinearModel: def __init__(self, config, trainingDataSource, validationDataSource): """Initializes the linear model object. Attributes: config: The configuration for the model. trainingDataSource: list of training samples and labels validationDataSource: list of validation samples and labels """ self.config = config self.trainingDataSource = trainingDataSource self.validationDataSource = validationDataSource self.graph = tf.Graph() self.session = tf.Session(graph=self.graph) self.checkpointer = ModelDescriptionCheckpointer(config, "LinearModel") self.isLoaded = False def train(self): """Trains the linear model. Trains the model for epochs specified in the config. Runs the validation dataset on the model if specified in the config. """ with self.graph.as_default(): self.getOrLoadModel() for epoch in range(self.getEpochs()): self.runOnTrainingDataset(epoch) if self.shouldRunValidation(): self.runOnValidationDataset(epoch) self.checkpoint() def predict(self, inputs, requestedPredictions): with self.graph.as_default(): self.getOrLoadModel() inputs = numpy.array(inputs) predictions = self.session.run(self.outputProbabilities, feed_dict={self.inputTokens: inputs}) batchSize = requestedPredictions.shape[0] length = requestedPredictions.shape[1] outputPredictions = numpy.zeros(requestedPredictions.shape) for b in range(batchSize): for l in range(length): outputPredictions[b,l,:] = \ predictions[b,l,requestedPredictions[b,l,:]] return outputPredictions def getOrLoadModel(self): """Returns a linear model. If specified, create a new model else load an already existing model. """ self.vocab = Vocab(self.config) shouldCreate = not os.path.exists(self.checkpointer.getModelDirectory( )) or self.getShouldCreateModel() if shouldCreate: self.createModel() else: self.loadModel() def loadModel(self): """Loads an already existing model from the specified path """ if self.isLoaded: return self.checkpointer.load() directory = self.checkpointer.getModelDirectory() logger.debug("Loading checkpoint from: " + str(directory)) tf.saved_model.loader.load(self.session, ["serve"], directory) self.setOperationsByName() self.isLoaded = True def setOperationsByName(self): self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0") self.labels = self.graph.get_tensor_by_name("output-labels:0") self.outputProbabilities = \ self.graph.get_tensor_by_name("output-probabilities:0") self.loss = self.graph.get_tensor_by_name("loss:0") self.optimizerStep = self.graph.get_operation_by_name("optimizer-step") def createModel(self): ## (batch, sequence-length, 1) self.inputTokens = tf.placeholder(tf.int32, shape=(None, None), name="input-tokens") self.labels = tf.placeholder(tf.int32, shape=(None, None), name="output-labels") predictedLogits = self.processInputMiniBatch(self.inputTokens) self.loss = self.evaluateLoss(predictedLogits, self.labels) self.outputProbabilities = tf.nn.softmax(predictedLogits, name="output-probabilities") # optimizer self.optimizerStep = self.createOptimizerStep(self.loss) # initializers self.globalInitializer = tf.global_variables_initializer() self.localInitializer = tf.local_variables_initializer() # summaries self.setupSummaries() # do the initialization self.initializeModel() def initializeModel(self): self.session.run(self.globalInitializer) self.session.run(self.localInitializer) def runOnTrainingDataset(self, epoch): """Trains the linear model on the training dataset for one epoch.""" trainStart = time.time() for step in range(self.getStepsPerEpoch()): generatorStart = time.time() inputs, 
labels = self.trainingDataSource.next() generatorEnd = time.time() trainStepStart = time.time() loss, gradNorm = self.trainingStep(inputs, labels, step) trainStepEnd = time.time() message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) + "), Step (" + str(step) + " / " + str(self.getStepsPerEpoch()) + "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) + ", training step time: " + ("%.2f" % (trainStepEnd - trainStepStart) + ", loss: " + str("%.2f" % loss) + ", grad norm: " + str("%.2f" % gradNorm))) print(message, end="\r", flush=True) trainEnd = time.time() print(message) logger.debug(" Training took: " + (str(trainEnd - trainStart)) + " seconds...") def trainingStep(self, inputs, labels, step): """Training step for one minibatch of training data.""" inputs = numpy.array(inputs) labels = numpy.array(labels) trainingLoss, gradNorm, summaries, _ = self.session.run( [ self.loss, self.gradientNorm, self.mergedSummary, self.optimizerStep ], feed_dict={ self.inputTokens: inputs, self.labels: labels }) self.trainingSummaryWriter.add_summary(summaries, step) return trainingLoss, gradNorm def runOnValidationDataset(self, epoch): """Runs the linear model on the validation dataset for one epoch.""" validationStart = time.time() for step in range(self.getValidationStepsPerEpoch()): generatorStart = time.time() inputs, labels = self.validationDataSource.next() generatorEnd = time.time() validationStepStart = time.time() loss, summary = self.validationStep(inputs, labels) validationStepEnd = time.time() message = ("Validation Step (" + str(step) + " / " + str(self.getValidationStepsPerEpoch()) + "), Generator time: " + ("%.2f" % (generatorEnd - generatorStart)) + ", validation step time: " + ("%.2f" % (validationStepEnd - validationStepStart) + ", loss: " + str(loss))) print(message, end="\r", flush=True) validationEnd = time.time() print(message) logger.debug(" Validation took: " + (str(validationEnd - validationStart)) + " seconds...") def validationStep(self, inputs, labels): """One minibatch of validation data processed by the model.""" inputs = numpy.array(inputs) labels = numpy.array(labels) validationLoss, summaries = self.session.run( [self.loss, self.mergedSummary], feed_dict={ self.inputTokens: inputs, self.labels: labels }) return validationLoss, summaries def createOptimizerStep(self, loss): """One step of backprop.""" optimizer = tf.train.AdamOptimizer(learning_rate=float( self.config["model"]["learningRate"]), beta1=0.9, beta2=0.999, epsilon=numpy.finfo(float).eps, name="optimizer-step") gradients, variables = zip(*optimizer.compute_gradients(loss)) gradients, _ = tf.clip_by_global_norm( gradients, self.config["model"]["gradientClippingFactor"]) self.gradientNorm = tf.global_norm(gradients, name="gradient-norm") return optimizer.apply_gradients(zip(gradients, variables)) def setupSummaries(self): tf.summary.scalar('cross-entropy', self.loss) tf.summary.scalar('gradient-norm', self.gradientNorm) self.mergedSummary = tf.summary.merge_all() self.trainingSummaryWriter = tf.summary.FileWriter( os.path.join(self.getExperimentDirectory(), 'training-summaries'), self.graph) if self.shouldRunValidation(): self.validationSummaryWriter = tf.summary.FileWriter( os.path.join(self.getExperimentDirectory(), 'validation-summaries'), self.graph) def evaluateLoss(self, batchOutputs, labels): return tf.identity(tf.losses.sparse_softmax_cross_entropy( labels=labels, logits=batchOutputs), name="loss") def processInputMiniBatch(self, inputs): return self.runEncoderDecoder(inputs, 
inputs) def runEncoderDecoder(self, inputSequence, historicSequence): # convert sequences to embeddings (output embeddings are Tensor(batch-size, sequence-length, hidden)) inputEmbeddings = self.convertToEmbeddings(inputSequence) historicEmbeddings = self.convertToEmbeddings(historicSequence) # run encoder (encodedEmbeddings is (batch-size, sequence-length, hidden)) encodedEmbeddings = self.runEncoder(inputEmbeddings) # run decoder (logits is Tensor(batch-size, sequence-length, vocab-size) logits = self.runDecoder(encodedEmbeddings, historicEmbeddings) return logits def convertToEmbeddings(self, sequenceIds): with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE): wordEmbeddingsGlobal = tf.get_variable('word-embeddings', \ [self.vocab.getSize(), self.getEmbeddingSize()]) wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal, sequenceIds) return wordEmbeddings def runEncoder(self, embeddings): return tf.layers.dense(embeddings, self.getEmbeddingSize(), activation="relu") def runDecoder(self, inputEmbeddings, historicEmbeddings): return tf.layers.dense( tf.concat([inputEmbeddings, historicEmbeddings], axis=2), self.vocab.getSize()) def checkpoint(self): """Creates a checkpoint of current model and saves to model directory. """ directory = self.checkpointer.getModelDirectory() logger.debug("Saving checkpoint to: " + str(directory)) self.checkpointer.checkpoint() exists = os.path.exists(directory) if exists: tempDirectory = directory + "-temp" shutil.move(directory, tempDirectory) with self.graph.as_default(): tf.saved_model.simple_save( self.session, directory, inputs={"input-tokens": self.inputTokens}, outputs={"output-probabilities": self.outputProbabilities}) if exists: shutil.rmtree(tempDirectory) """Functions to load configuration parameters.""" def getEmbeddingSize(self): return int(self.config["model"]["embeddingSize"]) def shouldRunValidation(self): return self.config["model"]["runValidation"] def getEpochs(self): return int(self.config["model"]["epochs"]) def getShouldCreateModel(self): if not "createNewModel" in self.config["model"]: return False return bool(self.config["model"]["createNewModel"]) def getStepsPerEpoch(self): return int(self.config["model"]["stepsPerEpoch"]) def getValidationStepsPerEpoch(self): return int(self.config["model"]["validationStepsPerEpoch"]) def getExperimentDirectory(self): return self.config["model"]["directory"]
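# The linear model's training objective is a sparse softmax cross-entropy over the vocab
# logits. A numpy sketch of the same quantity for a single position (toy logits,
# illustrative only -- not the TF graph defined above).
import numpy

logits = numpy.array([2.0, 0.5, -1.0])    # scores for a 3-token vocabulary
label = 0                                  # index of the correct token

logProbs = logits - numpy.log(numpy.exp(logits).sum())
print(-logProbs[label])                    # cross-entropy for this position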
class LabelAdaptor:
    def __init__(self, config, source):
        self.config = config
        self.source = source
        self.secondSource = source.clone()
        self.secondSource.shuffleDocuments()
        self.random = numpy.random.RandomState(seed=self.getSeed())
        self.vocab = Vocab(config)

    def next(self):
        chunk = self.source.next()

        isFromSameSource = self.random.binomial(1, 0.5)
        if isFromSameSource:
            secondChunk = self.source.next()
        else:
            secondChunk = self.secondSource.next()

        chunk, documentId = zip(*chunk)
        secondChunk, secondDocumentId = zip(*secondChunk)

        labels = self.addTokenLabels(chunk, documentId)
        inputs = self.maskOffTokens(labels)

        secondLabels = self.addTokenLabels(secondChunk, secondDocumentId)
        secondInputs = self.maskOffTokens(secondLabels)

        return inputs, labels, secondInputs, secondLabels

    def addTokenLabels(self, chunk, documentIds):
        return [documentIds[0]] + list(chunk)

    def maskOffTokens(self, labels):
        inputs = list(labels)
        for i in range(1, len(labels)):
            if self.random.binomial(1, 0.15):
                if self.random.binomial(1, 0.8):
                    inputs[i] = Vocab.getMaskToken()
                else:
                    if self.random.binomial(1, 0.5):
                        inputs[i] = self.random.randint(Vocab.getVocabOffset(),
                                                        self.vocab.getSize())
        inputs[0] = Vocab.getClassLabelToken()
        return inputs

    def getSeed(self):
        if not "seed" in self.config["adaptor"]["labels"]:
            return 122
        return int(self.config["adaptor"]["labels"]["seed"])

    def reset(self):
        self.random = numpy.random.RandomState(seed=self.getSeed())
        self.source.reset()
        self.secondSource.reset()
        self.secondSource.shuffleDocuments()

    def size(self):
        return self.source.size()

    def setMaximumSize(self, size):
        self.source.setMaximumSize(size)
class FallbackTokenEvaluator: def __init__(self, config): self.config = config self.vocab = Vocab(config) def initialize(self): self.perplexityStates = self.createPerplexityStates( self.getBatchSize()) def evaluate(self, inputs, labels, predictions): inputIndices, predictions, vocabProbabilities = self.rewriteSplitTokens( inputs, labels, predictions) self.recordPredictions(predictions, vocabProbabilities, inputIndices, inputs) def getRequestedPredictions(self, inputs, labels): return numpy.expand_dims(labels, axis=2) def finalize(self): return self.getPerplexity() def getBatchSize(self): if not "adaptor" in self.config: return 1 if not "batching" in self.config["adaptor"]: return 1 if not "size" in self.config["adaptor"]["batching"]: return 1 return int(self.config["adaptor"]["batching"]["size"]) def createPerplexityStates(self, count): return [PerplexityState(self.vocab) for i in range(count)] def getPerplexity(self): byteCount = sum( [state.getByteCount() for state in self.perplexityStates]) totalEntropy = sum( [state.getEntropy() for state in self.perplexityStates]) return 2.0**(totalEntropy / byteCount) def recordPredictions(self, predictions, vocabProbabilities, inputIndices, inputs): # predictions is Tensor(batch-size, sequence-length, vocab-size) # inputs is Tensor(batch-size, sequence-length) batchSize = predictions.shape[0] sequenceLength = predictions.shape[1] # TODO: replace with something like batch gather for batch in range(batchSize): for element in range(sequenceLength): labelPrediction = predictions[batch, element] self.perplexityStates[batch].addPrediction( inputs[batch, :], inputIndices[batch, element], labelPrediction, vocabProbabilities[batch, element, :]) def rewriteSplitTokens(self, inputs, labels, predictions): from functools import reduce newInputs = [] newPredictions = [] newVocabProbabilities = [] batchSize = predictions.shape[0] sequenceLength = predictions.shape[1] # collapse expanded tokens for batch in range(batchSize): inputString = "".join([ self.vocab.getTokenString(token) for token in labels[batch, :] if not Vocab.isReservedToken(token) ]) reservedIndices = set([ index for index, token in enumerate(labels[batch, :]) if Vocab.isReservedToken(token) ]) tokenizer = UnlimitedVocabTokenizerAdaptor( StringDataSource(inputString)) completeTokens = [ tokenizer.next() for i in range(tokenizer.size()) ] logger.debug("Reformed input string: '" + str([ self.vocab.getTokenString(token) for token in labels[batch, :] if not Vocab.isReservedToken(token) ])) logger.debug("' tokenized to: " + str(completeTokens)) logger.debug( " tokens: " + str([self.vocab.getToken(token) for token in completeTokens])) index = 0 completeTokenIndex = 0 newBatchInputs = [] newBatchPredictions = [] newBatchVocabProbabilities = [] while index < sequenceLength: token = labels[batch, index] completeToken = completeTokens[completeTokenIndex] # get token end tokenEndIndex = index + 1 if self.vocab.getToken( completeToken ) != token and not index in reservedIndices: while tokenEndIndex < sequenceLength: possibleToken = labels[batch, tokenEndIndex] if (completeTokenIndex + 1) < len(completeTokens): if self.vocab.getToken( completeTokens[completeTokenIndex + 1]) == possibleToken: break tokenEndIndex += 1 # add token newBatchInputs.append([index, tokenEndIndex]) newBatchVocabProbabilities.append( list(predictions[batch, index, :])) newBatchVocabProbabilities[-1][0] = 0.0 # compute new probabilities for the merged token predictionValues = predictions[batch, index:tokenEndIndex, 0] 
newBatchPredictions.append( reduce(lambda x, y: x * y, predictionValues)) if tokenEndIndex > (index + 1): logger.debug("Reformed split tokens: " + str([ self.vocab.getTokenString(token) for token in labels[batch, index:tokenEndIndex] ]) + (" with prob: %.4f" % newBatchPredictions[-1])) if not index in reservedIndices: completeTokenIndex += 1 index = tokenEndIndex newInputs.append(newBatchInputs) newPredictions.append(newBatchPredictions) newVocabProbabilities.append(newBatchVocabProbabilities) # pad maxLength = max([len(tokens) for tokens in newInputs]) newInputs = [ inputs + [self.getPadToken() for i in range(maxLength - len(inputs))] for inputs in newInputs ] newPredictions = [ predictions + [0.0 for i in range(maxLength - len(predictions))] for predictions in newPredictions ] newVocabProbabilties = [ predictions + [0.0 for i in range(maxLength - len(predictions))] for predictions in newVocabProbabilities ] return numpy.array(newInputs), numpy.array( newPredictions), numpy.array(newVocabProbabilities)
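# rewriteSplitTokens merges sub-word pieces back into whole tokens and multiplies their
# probabilities, exactly the reduce(lambda x, y: x * y, ...) above; a tiny sketch with
# made-up piece probabilities.
from functools import reduce

pieceProbabilities = [0.9, 0.5, 0.8]       # model probabilities for pieces of one token
wholeTokenProbability = reduce(lambda x, y: x * y, pieceProbabilities)
print(wholeTokenProbability)               # 0.36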
class UnigramModel:
    def __init__(self, config, trainingData, validationData):
        self.config = config
        self.trainingData = trainingData
        self.validationData = validationData
        self.checkpointer = ModelDescriptionCheckpointer(config, "UnigramModel")

        if not self.trainingData is None:
            self.trainingData.setMaximumSize(
                int(self.config["model"]["stepsPerEpoch"]))
        if not self.validationData is None:
            self.validationData.setMaximumSize(
                int(self.config["model"]["validationStepsPerEpoch"]))

        self.getOrLoadModel()

    def train(self):
        for epoch in range(self.getEpochs()):
            self.trainingData.reset()
            self.runOnTrainingDataset(epoch)

            if self.shouldRunValidation():
                self.validationData.reset()
                self.runOnValidationDataset(epoch)

            self.checkpoint()

    def runOnTrainingDataset(self, epoch):
        import time
        trainStart = time.time()

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()
            inputs, labels = self.trainingData.next()
            generatorEnd = time.time()

            trainStepStart = time.time()
            self.trainingStep(inputs, labels)
            trainStepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                       "), Step (" + str(step) + " / " +
                       str(self.getStepsPerEpoch()) + "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", training step time: " +
                       ("%.2f" % (trainStepEnd - trainStepStart)))
            print(message, end="\r", flush=True)

        trainEnd = time.time()
        print(message)
        logger.debug(" Training took: " + str(trainEnd - trainStart) +
                     " seconds...")

    def trainingStep(self, inputs, labels):
        # just consider the labels
        for batch in range(labels.shape[0]):
            self.totalTokens += labels.shape[1]
            for token in range(labels.shape[1]):
                self.tokenCounts[labels[batch, token]] += 1

    def runOnValidationDataset(self, epoch):
        import time
        start = time.time()

        totalCrossEntropy = 0.0
        totalBytes = 0

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()
            inputs, labels = self.validationData.next()
            generatorEnd = time.time()

            stepStart = time.time()
            crossEntropy, byteCount = self.validationStep(inputs, labels)
            stepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                       "), Step (" + str(step) + " / " +
                       str(self.getValidationStepsPerEpoch()) +
                       "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", validation step time: " +
                       ("%.2f" % (stepEnd - stepStart)) + ", loss is " +
                       str(crossEntropy / byteCount))
            print(message, end="\r", flush=True)

            totalCrossEntropy += crossEntropy
            totalBytes += byteCount

        end = time.time()
        print(message)
        logger.debug(" Validation took: " + str(end - start) +
                     " seconds, cross entropy is " +
                     str(totalCrossEntropy / totalBytes))

    def validationStep(self, inputs, labels):
        import math
        crossEntropy = 0.0
        byteCount = 0

        for batch in range(labels.shape[0]):
            for index in range(labels.shape[1]):
                token = labels[batch, index]
                tokenProbability = self.getTokenProbability(token)
                crossEntropy += -math.log(tokenProbability)
                byteCount += self.vocab.getTokenBytes(token)

        return crossEntropy, byteCount

    def getTokenProbability(self, token):
        count = self.tokenCounts[token]
        # TODO: Implement kneser ney smoothing
        return (count + 1.0) / (self.totalTokens + 1.0)

    def getOrLoadModel(self):
        import os
        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(
            self.checkpointer.getModelDirectory()) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.load()

    def createModel(self):
        self.tokenCounts = numpy.zeros(self.vocab.getSize())
        self.totalTokens = 0

    def checkpoint(self):
        import json
        import os
        import shutil

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()

        exists = os.path.exists(directory)
        if exists:
            tempDirectory = directory + "-temp"
            shutil.move(directory, tempDirectory)

        os.makedirs(directory)

        with open(os.path.join(directory, "unigram-statistics.json"),
                  "w") as jsonFile:
            json.dump([self.totalTokens, [i for i in self.tokenCounts]],
                      jsonFile)

        if exists:
            shutil.rmtree(tempDirectory)

    def predict(self, inputs):
        batchSize = inputs.shape[0]
        length = inputs.shape[1]
        vocab = self.getVocab().getSize()

        probs = [self.getTokenProbability(token) for token in range(vocab)]

        return numpy.broadcast_to(numpy.array(probs),
                                  [batchSize, length, vocab])

    def load(self):
        import os
        import json

        self.checkpointer.load()

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Loading checkpoint from: " + str(directory))

        with open(os.path.join(directory, "unigram-statistics.json"),
                  "r") as jsonFile:
            self.totalTokens, counts = json.load(jsonFile)

        self.tokenCounts = numpy.array(counts)

    def getVocab(self):
        return self.vocab

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "createNewModel" in self.config["model"]:
            return False
        return bool(self.config["model"]["createNewModel"])

    def getStepsPerEpoch(self):
        return min(int(self.config["model"]["stepsPerEpoch"]),
                   self.trainingData.size())

    def getValidationStepsPerEpoch(self):
        return min(int(self.config["model"]["validationStepsPerEpoch"]),
                   self.validationData.size())

    def shouldRunValidation(self):
        if not "runValidation" in self.config["model"]:
            return True
        return bool(self.config["model"]["runValidation"])
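# Standalone sketch (not used by UnigramModel) of the same add-one smoothed
# unigram estimate used in getTokenProbability above; `counts` is a
# hypothetical token->count dict and `heldOutTokens` a list of token ids.
import math

def unigramBitsPerToken(counts, totalTokens, heldOutTokens):
    entropy = 0.0
    for token in heldOutTokens:
        # add-one smoothing, matching (count + 1) / (totalTokens + 1) above
        probability = (counts.get(token, 0) + 1.0) / (totalTokens + 1.0)
        entropy += -math.log(probability, 2)
    # average bits per held-out token under the unigram model
    return entropy / len(heldOutTokens)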
def isPredictedToken(self, token):
    return token == Vocab.getMaskToken() or token == Vocab.getVocabOffset()
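# Hypothetical companion helper (not in the original module), shown in the
# same method style: it assumes `inputs` is the masked input sequence and
# returns the positions whose input token is one of the special
# predicted-token markers checked by isPredictedToken above.
def predictedPositions(self, inputs):
    return [
        index for index, token in enumerate(inputs)
        if self.isPredictedToken(token)
    ]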
def groupDataIntoClusters(self):
    kmeans = MiniBatchKMeans(n_clusters=self.numberOfClusters)
    featurizer = Featurizer(self.config, self.validationDataset)
    vocab = Vocab(self.config)

    if self.usePCA():
        pca = IncrementalPCA(n_components=32)

    logger.info("Reducing dimensionality...")

    # fit the pca model
    if self.usePCA():
        for iteration in range(self.getIterations()):
            if iteration % 10 == 0:
                logger.info(" " + str(iteration) + " / " +
                            str(self.getIterations()))
            inputs, labels, embeddings = featurizer.featurizeOneBatch()
            pca.partial_fit(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        self.validationDataset.reset()

    logger.info("Fitting model...")

    # fit the kmeans model
    for iteration in range(self.getIterations()):
        if iteration % 10 == 0:
            inputs, labels, embeddings, dataTime, modelTime = \
                featurizer.featurizeOneBatch(reportTime=True)
            logger.info(" " + str(iteration) + " / " +
                        str(self.getIterations()) + " data load time: " +
                        str(dataTime) + " model eval time: " + str(modelTime))
        else:
            inputs, labels, embeddings = featurizer.featurizeOneBatch()

        if self.usePCA():
            embeddings = pca.transform(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        kmeans.partial_fit(
            numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

    self.validationDataset.reset()

    # group into clusters
    # create a histogram of word frequencies per cluster
    clusterHistogram = {i: {} for i in range(self.numberOfClusters)}
    clusterWins = {i: 0 for i in range(self.numberOfClusters)}
    documentMap = {}

    logger.info("Clustering data...")

    for iteration in range(self.getIterations()):
        if iteration % 10 == 0:
            inputs, labels, embeddings, dataTime, modelTime = \
                featurizer.featurizeOneBatch(reportTime=True)
            logger.info(" " + str(iteration) + " / " +
                        str(self.getIterations()) + " data load time: " +
                        str(dataTime) + " model eval time: " + str(modelTime))
        else:
            inputs, labels, embeddings = featurizer.featurizeOneBatch()

        chunkLength = embeddings.shape[1]
        batchSize = embeddings.shape[0]

        if self.usePCA():
            embeddings = pca.transform(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1])))

        clusters = numpy.reshape(
            kmeans.predict(
                numpy.reshape(embeddings, (-1, embeddings.shape[-1]))),
            (batchSize, chunkLength))

        for batch in range(batchSize):
            documentId = labels[batch, 0]

            if not documentId in documentMap:
                documentMap[documentId] = []

            clusterIds = []

            for wordIndex in range(1, chunkLength):
                word = vocab.getTokenString(labels[batch, wordIndex])
                cluster = clusters[batch, wordIndex]

                clusterIds.append(cluster)

                if not labels[batch, wordIndex] in clusterHistogram[cluster]:
                    clusterHistogram[cluster][labels[batch, wordIndex]] = 0

                clusterHistogram[cluster][labels[batch, wordIndex]] += 1
                clusterWins[cluster] += 1

            documentMap[documentId].extend(clusterIds)

    if not os.path.exists(self.outputDirectory):
        os.makedirs(self.outputDirectory)

    # write histograms
    with open(self.getOutputHistogramFileName(), "w") as log:
        for clusterId, clusterCount in sorted(clusterWins.items(),
                                              key=lambda x: x[1],
                                              reverse=True):
            words = clusterHistogram[clusterId]

            log.write("Cluster, " + str(clusterId) + " (" +
                      str(clusterCount) + ")\n")

            for wordIndex, count in sorted(words.items(),
                                           key=lambda x: x[1],
                                           reverse=True):
                log.write("    '" + vocab.getTokenString(wordIndex) + "' " +
                          str(count) + "\n")

    # write document clusters
    for documentId, clusters in documentMap.items():
        histogram = {}
        for cluster in clusters:
            if not cluster in histogram:
                histogram[cluster] = 0
            histogram[cluster] += 1

        with open(self.getOutputDocumentClusterFileName(documentId),
                  "w") as log:
            for cluster, count in sorted(histogram.items(),
                                         key=lambda x: x[1],
                                         reverse=True):
                words = clusterHistogram[cluster]
                topWord = vocab.getTokenString(
                    sorted(words.items(), key=lambda x: x[1],
                           reverse=True)[0][0])

                log.write("Cluster, " + str(cluster) + ", " + topWord + ", " +
                          str(count) + "\n")
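# Standalone sketch of the streaming fit pattern used above, with synthetic
# shapes and a hypothetical `batches` list of (batch, sequence, hidden)
# arrays: IncrementalPCA and MiniBatchKMeans are each fit one minibatch at a
# time via partial_fit, so the full embedding matrix never needs to fit in
# memory.
import numpy
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import IncrementalPCA

def clusterEmbeddingBatches(batches, numberOfClusters=16, components=32):
    pca = IncrementalPCA(n_components=components)
    kmeans = MiniBatchKMeans(n_clusters=numberOfClusters)

    # first pass: fit the dimensionality reduction incrementally
    for embeddings in batches:
        flattened = numpy.reshape(embeddings, (-1, embeddings.shape[-1]))
        pca.partial_fit(flattened)

    # second pass: cluster the reduced embeddings incrementally
    for embeddings in batches:
        flattened = numpy.reshape(embeddings, (-1, embeddings.shape[-1]))
        kmeans.partial_fit(pca.transform(flattened))

    return pca, kmeans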
class BERTModel:
    def __init__(self, config, trainingDataSource, validationDataSource):
        """Initializes the BERT model object.

        Attributes:
            config: The configuration for the model.
            trainingDataSource: list of training samples and labels
            validationDataSource: list of validation samples and labels
        """
        self.config = config
        self.trainingDataSource = trainingDataSource
        self.validationDataSource = validationDataSource
        self.graph = tf.Graph()
        self.session = tf.Session(graph=self.graph)
        self.checkpointer = ModelDescriptionCheckpointer(config, "BERTModel")
        self.isLoaded = False

    def train(self):
        """Trains the BERT model.

        Trains the model for the number of epochs specified in the config.
        Runs the validation dataset on the model if specified in the config.
        """
        with self.graph.as_default():
            self.getOrLoadModel()

            for epoch in range(self.getEpochs()):
                self.runOnTrainingDataset(epoch)

                if self.shouldRunValidation():
                    self.runOnValidationDataset(epoch)

                self.checkpoint()

    def predict(self, inputs, requestedPredictions):
        with self.graph.as_default():
            self.getOrLoadModel()

            inputs = numpy.array(inputs)

            predictions = self.session.run(
                self.outputProbabilities,
                feed_dict={self.inputTokens: inputs})

            batchSize = requestedPredictions.shape[0]
            length = requestedPredictions.shape[1]

            outputPredictions = numpy.zeros(requestedPredictions.shape)

            for b in range(batchSize):
                for l in range(length):
                    outputPredictions[b, l, :] = \
                        predictions[b, l, requestedPredictions[b, l, :]]

            return outputPredictions

    def getOrLoadModel(self):
        """Creates a new model if requested in the config, otherwise loads an
        already existing model from the checkpoint directory.
        """
        self.vocab = Vocab(self.config)

        shouldCreate = not os.path.exists(
            self.checkpointer.getModelDirectory()) or self.getShouldCreateModel()

        if shouldCreate:
            self.createModel()
        else:
            self.loadModel()

    def loadModel(self):
        """Loads an already existing model from the checkpoint directory."""
        if self.isLoaded:
            return

        self.checkpointer.load()

        directory = self.checkpointer.getModelDirectory()
        logger.debug("Loading checkpoint from: " + str(directory))

        tf.saved_model.loader.load(self.session, ["serve"], directory)

        self.setOperationsByName()
        self.isLoaded = True

    def setOperationsByName(self):
        self.inputTokens = self.graph.get_tensor_by_name("input-tokens:0")
        self.labels = self.graph.get_tensor_by_name("output-labels:0")
        self.outputProbabilities = \
            self.graph.get_tensor_by_name("output-probabilities:0")
        self.loss = self.graph.get_tensor_by_name("loss:0")
        self.optimizerStep = self.graph.get_operation_by_name("optimizer-step")

    def createModel(self):
        ## (batch, sequence-length, 1)
        self.inputTokens = tf.placeholder(tf.int32,
                                          shape=(None, None),
                                          name="input-tokens")
        self.labels = tf.placeholder(tf.int32,
                                     shape=(None, None),
                                     name="output-labels")

        predictedLogits = self.processInputMiniBatch(self.inputTokens)

        self.loss = self.evaluateLoss(predictedLogits, self.labels)
        self.outputProbabilities = tf.nn.softmax(predictedLogits,
                                                 name="output-probabilities")

        # optimizer
        self.optimizerStep = self.createOptimizerStep(self.loss)

        # initializers
        self.globalInitializer = tf.global_variables_initializer()
        self.localInitializer = tf.local_variables_initializer()

        # summaries
        self.setupSummaries()

        # do the initialization
        self.initializeModel()

    def initializeModel(self):
        self.session.run(self.globalInitializer)
        self.session.run(self.localInitializer)

    def runOnTrainingDataset(self, epoch):
        """Trains the model on the training dataset for one epoch."""
        trainStart = time.time()

        for step in range(self.getStepsPerEpoch()):
            generatorStart = time.time()
            inputs, labels = self.trainingDataSource.next()
            generatorEnd = time.time()

            trainStepStart = time.time()
            loss, gradNorm = self.trainingStep(inputs, labels, step)
            trainStepEnd = time.time()

            message = ("Epoch (" + str(epoch) + " / " + str(self.getEpochs()) +
                       "), Step (" + str(step) + " / " +
                       str(self.getStepsPerEpoch()) + "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", training step time: " +
                       ("%.2f" % (trainStepEnd - trainStepStart)) +
                       ", loss: " + ("%.2f" % loss) +
                       ", grad norm: " + ("%.2f" % gradNorm))
            print(message, end="\r", flush=True)

        trainEnd = time.time()
        print(message)
        logger.debug(" Training took: " + str(trainEnd - trainStart) +
                     " seconds...")

    def trainingStep(self, inputs, labels, step):
        """Training step for one minibatch of training data."""
        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        trainingLoss, gradNorm, summaries, _ = self.session.run(
            [
                self.loss, self.gradientNorm, self.mergedSummary,
                self.optimizerStep
            ],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })

        self.trainingSummaryWriter.add_summary(summaries, step)

        return trainingLoss, gradNorm

    def runOnValidationDataset(self, epoch):
        """Runs the model on the validation dataset for one epoch."""
        validationStart = time.time()

        for step in range(self.getValidationStepsPerEpoch()):
            generatorStart = time.time()
            inputs, labels = self.validationDataSource.next()
            generatorEnd = time.time()

            validationStepStart = time.time()
            loss, summary = self.validationStep(inputs, labels)
            validationStepEnd = time.time()

            message = ("Validation Step (" + str(step) + " / " +
                       str(self.getValidationStepsPerEpoch()) +
                       "), Generator time: " +
                       ("%.2f" % (generatorEnd - generatorStart)) +
                       ", validation step time: " +
                       ("%.2f" % (validationStepEnd - validationStepStart)) +
                       ", loss: " + str(loss))
            print(message, end="\r", flush=True)

        validationEnd = time.time()
        print(message)
        logger.debug(" Validation took: " +
                     str(validationEnd - validationStart) + " seconds...")

    def validationStep(self, inputs, labels):
        """One minibatch of validation data processed by the model."""
        inputs = numpy.array(inputs)
        labels = numpy.array(labels)

        validationLoss, summaries = self.session.run(
            [self.loss, self.mergedSummary],
            feed_dict={
                self.inputTokens: inputs,
                self.labels: labels
            })

        return validationLoss, summaries

    def createOptimizerStep(self, loss):
        """One step of backprop."""
        optimizer = tf.train.AdamOptimizer(
            learning_rate=float(self.config["model"]["learningRate"]),
            beta1=0.9,
            beta2=0.999,
            epsilon=numpy.finfo(float).eps,
            name="optimizer-step")

        gradients, variables = zip(*optimizer.compute_gradients(loss))
        gradients, _ = tf.clip_by_global_norm(
            gradients, self.config["model"]["gradientClippingFactor"])
        self.gradientNorm = tf.global_norm(gradients, name="gradient-norm")

        return optimizer.apply_gradients(zip(gradients, variables))

    def setupSummaries(self):
        tf.summary.scalar('cross-entropy', self.loss)
        tf.summary.scalar('gradient-norm', self.gradientNorm)

        self.mergedSummary = tf.summary.merge_all()

        self.trainingSummaryWriter = tf.summary.FileWriter(
            os.path.join(self.getExperimentDirectory(), 'training-summaries'),
            self.graph)

        if self.shouldRunValidation():
            self.validationSummaryWriter = tf.summary.FileWriter(
                os.path.join(self.getExperimentDirectory(),
                             'validation-summaries'), self.graph)

    def evaluateLoss(self, batchOutputs, labels):
        return tf.identity(tf.losses.sparse_softmax_cross_entropy(
            labels=labels, logits=batchOutputs),
                           name="loss")

    def processInputMiniBatch(self, inputs):
        return self.runBERT(inputs, inputs)

    def runBERT(self, inputSequence, historicSequence):
        # convert sequences to embeddings
        # (output embeddings are Tensor(batch-size, sequence-length, hidden))
        inputEmbeddings = self.convertToEmbeddings(inputSequence)
        inputEmbeddingsPositionallyEncoded = self.getPositionalEncodings(
            inputEmbeddings)

        # run encoder
        # (encodedEmbeddings is (batch-size, sequence-length, hidden))
        encodedEmbeddings = self.runEncoder(inputEmbeddingsPositionallyEncoded)

        return tf.layers.dense(encodedEmbeddings, units=self.vocab.getSize())

    def convertToEmbeddings(self, sequenceIds):
        with tf.variable_scope("linear-embeddings", reuse=tf.AUTO_REUSE):
            wordEmbeddingsGlobal = tf.get_variable(
                'word-embeddings',
                [self.vocab.getSize(), self.getEmbeddingSize()])

        wordEmbeddings = tf.nn.embedding_lookup(wordEmbeddingsGlobal,
                                                sequenceIds)

        return wordEmbeddings

    def getPositionalEncodings(self, inputEmbeddings):
        # PE(pos, 2i)     = sin(pos / 10000^(2i / d_model))
        # PE(pos, 2i + 1) = cos(pos / 10000^(2i / d_model))
        # where pos is the position and i is the dimension.
        batchSize = tf.shape(inputEmbeddings)[0]
        sequenceLength = tf.shape(inputEmbeddings)[1]
        hiddenDimension = tf.shape(inputEmbeddings)[2]

        sequenceRange = tf.reshape(
            tf.range(tf.cast(sequenceLength, tf.float32)),
            (1, sequenceLength, 1))
        hiddenRange = tf.reshape(
            tf.range(tf.cast(hiddenDimension, tf.float32)),
            (1, 1, hiddenDimension))

        rawPE = sequenceRange / (tf.pow(
            10000.0, 2.0 * hiddenRange / tf.cast(hiddenDimension, tf.float32)))

        PE_cos = tf.cos(rawPE[:, 0::2, :])
        PE_sin = tf.sin(rawPE[:, 1::2, :])

        PE_cos = tf.reshape(
            PE_cos, (batchSize, tf.shape(PE_cos)[1], 1, hiddenDimension))
        PE_sin = tf.reshape(
            PE_sin, (batchSize, tf.shape(PE_sin)[1], 1, hiddenDimension))

        PE = tf.concat([PE_cos, PE_sin], axis=2)

        return inputEmbeddings + tf.reshape(
            PE, (batchSize, sequenceLength, hiddenDimension))

    def runEncoder(self, embeddings):
        for i in range(self.getLayerCount()):
            right = self.multiHeadedAttention(embeddings)
            left = tf.layers.dense(right, units=embeddings.shape[-1])
            embeddings = self.addAndNorm(left, right)

        return embeddings

    def multiHeadedAttention(self, embeddings):
        # Q, K, V are all derived from the projected embeddings
        projectedEmbeddings = self.projectEmbeddings(embeddings)
        attentionResults = self.attention(projectedEmbeddings)
        left = self.projectAttentionOutput(attentionResults)

        return self.addAndNorm(left, embeddings)

    def projectEmbeddings(self, embeddings):
        # input  -> (m, seqL, embedding size)
        # output -> (m, seqL, 3 * numberOfAttentionHeads * embedding size)
        retVal = tf.layers.dense(
            embeddings,
            units=3 * self.getAttentionHeads() * embeddings.shape[-1])

        return tf.reshape(retVal, [
            tf.shape(retVal)[0],
            tf.shape(retVal)[1], 3,
            self.getAttentionHeads(), embeddings.shape[-1]
        ])

    def attention(self, projectedEmbeddings):
        # (m, seqL, (Q, K, V), attentionHeads, embedding size)
        Q = projectedEmbeddings[:, :, 0, :, :]
        K = projectedEmbeddings[:, :, 1, :, :]
        V = projectedEmbeddings[:, :, 2, :, :]

        d_k = int(projectedEmbeddings.shape[-1])

        m1 = tf.matmul(Q, K, transpose_b=True) / math.sqrt(d_k)
        smx = tf.nn.softmax(m1)

        return tf.matmul(smx, V)

    def projectAttentionOutput(self, attentionResults):
        # attentionResults -> (m, seqL, attentionHeads, embedding size)
        # new shape is (batch, sequence length, heads * embedding-size)
        batchSize = tf.shape(attentionResults)[0]
        sequenceLength = tf.shape(attentionResults)[1]

        reshapedEmbeddings = tf.reshape(
            attentionResults,
            (batchSize, sequenceLength,
             attentionResults.shape[-1] * attentionResults.shape[-2]))

        return tf.layers.dense(reshapedEmbeddings,
                               units=attentionResults.shape[-1])

    def addAndNorm(self, left, right):
        normalizedLeft = tf.contrib.layers.layer_norm(left)
        return tf.add(normalizedLeft, right)

    def checkpoint(self):
        """Creates a checkpoint of the current model and saves it to the
        model directory.
        """
        directory = self.checkpointer.getModelDirectory()
        logger.debug("Saving checkpoint to: " + str(directory))

        self.checkpointer.checkpoint()

        exists = os.path.exists(directory)
        if exists:
            tempDirectory = directory + "-temp"
            shutil.move(directory, tempDirectory)

        with self.graph.as_default():
            tf.saved_model.simple_save(
                self.session,
                directory,
                inputs={"input-tokens": self.inputTokens},
                outputs={"output-probabilities": self.outputProbabilities})

        if exists:
            shutil.rmtree(tempDirectory)

    # Functions to load configuration parameters.
    def getEmbeddingSize(self):
        return int(self.config["model"]["embeddingSize"])

    def shouldRunValidation(self):
        return self.config["model"]["runValidation"]

    def getEpochs(self):
        return int(self.config["model"]["epochs"])

    def getShouldCreateModel(self):
        if not "createNewModel" in self.config["model"]:
            return False
        return bool(self.config["model"]["createNewModel"])

    def getStepsPerEpoch(self):
        return int(self.config["model"]["stepsPerEpoch"])

    def getValidationStepsPerEpoch(self):
        return int(self.config["model"]["validationStepsPerEpoch"])

    def getLayerCount(self):
        return self.config["model"]["layerCount"]

    def getAttentionHeads(self):
        return self.config["model"]["attentionHeads"]

    def getExperimentDirectory(self):
        return self.config["model"]["directory"]
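# Reference sketches in plain numpy (not used by BERTModel) of the two
# formulas the encoder above relies on: the sinusoidal positional encoding
# quoted in getPositionalEncodings, and scaled dot-product attention as in
# attention(). Shapes and names are illustrative only; hiddenDimension is
# assumed even and no attention masking is applied.
import numpy

def sinusoidalPositionalEncoding(sequenceLength, hiddenDimension):
    # PE[pos, 2i]     = sin(pos / 10000^(2i / d_model))
    # PE[pos, 2i + 1] = cos(pos / 10000^(2i / d_model))
    positions = numpy.arange(sequenceLength)[:, None]
    dimensions = numpy.arange(0, hiddenDimension, 2)[None, :]
    angles = positions / numpy.power(10000.0,
                                     dimensions / float(hiddenDimension))
    encoding = numpy.zeros((sequenceLength, hiddenDimension))
    encoding[:, 0::2] = numpy.sin(angles)
    encoding[:, 1::2] = numpy.cos(angles)
    return encoding

def scaledDotProductAttention(Q, K, V):
    # softmax(Q K^T / sqrt(d_k)) V, with Q, K, V of shape (sequence, d_k)
    d_k = Q.shape[-1]
    scores = Q @ K.T / numpy.sqrt(d_k)
    scores = scores - scores.max(axis=-1, keepdims=True)  # numerical stability
    weights = numpy.exp(scores)
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights @ V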