def trainFunc(self, inputPremise, inputHypothesis, yTarget, learnRate, gradMax, L2regularization):
    premProject = T.dot(inputPremise, self.W_proj)
    hypoProject = T.dot(inputHypothesis, self.W_proj)
    sumPrem = premProject.sum(axis=1) + self.b_proj
    sumHypo = hypoProject.sum(axis=1) + self.b_proj  # Should be dim (n, dimProject) where n is batch size

    concatVec = T.concatenate([sumPrem, sumHypo], axis=1)
    activeVec = T.tanh(concatVec)
    yPred = T.nnet.softmax(activeVec)

    entropy = T.nnet.categorical_crossentropy(yPred, yTarget).mean()
    cost = entropy + computeParamNorms([self.W_proj], L2regularization)
    costFunc = theano.function([inputPremise, inputHypothesis, yTarget], cost)

    grads, _ = computeGrads(inputPremise, inputHypothesis, yTarget, cost, gradMax, self.params.values())
    fGradShared, fUpdate = rmsprop(grads, learnRate, inputPremise, inputHypothesis, yTarget, cost, self.params)
    return fGradShared, fUpdate, costFunc
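
# --- Usage sketch (illustrative, not part of the original module) ------------------------
# How the three objects returned by trainFunc might be driven in a training loop, assuming
# the rmsprop helper follows the common two-function Theano pattern: fGradShared evaluates
# the cost and stashes the gradients in shared variables, and fUpdate then applies the
# parameter update. Whether fUpdate takes the learning rate (as below) or no arguments
# depends on that helper, which is not shown here. The names model, miniBatches, and
# learnRateValue are hypothetical.
def _sketchTrainingLoop(model, miniBatches, learnRateValue):
    inputPremise = T.ftensor3("inputPremise")
    inputHypothesis = T.ftensor3("inputHypothesis")
    yTarget = T.fmatrix("yTarget")
    learnRate = T.scalar("learnRate")  # assumed symbolic, as in the standard Theano LSTM recipe

    fGradShared, fUpdate, costFn = model.trainFunc(inputPremise, inputHypothesis, yTarget,
                                                   learnRate, gradMax=5.0, L2regularization=1e-4)

    for premBatch, hypoBatch, labelBatch in miniBatches:
        batchCost = fGradShared(premBatch, hypoBatch, labelBatch)  # cost + gradient computation
        fUpdate(learnRateValue)                                    # assumed to apply the RMSprop step
        print "minibatch cost: ", batchCost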
def costFunc(self, inputPremise, inputHypothesis, yTarget, layer, L2regularization, dropoutRate,
             premiseOutputs, batchSize, sentenceAttention=False, wordwiseAttention=False,
             numTimestepsHypothesis=1, numTimestepsPremise=1):
    """
    Compute end-to-end cost function for a collection of input data.
    :param layer: whether we are doing a forward computation in the premise or hypothesis layer
    :return: Symbolic expression for cost function as well as theano function for computing cost expression.
    """
    if layer == "premise":
        _ = self.forwardRun(inputPremise, numTimestepsPremise)
    elif layer == "hypothesis":
        timestepOut, _ = self.forwardRun(inputHypothesis, numTimestepsHypothesis)

    # Apply sentence-level attention -- notation consistent with paper
    if sentenceAttention:
        hstar = self.applySentenceAttention(premiseOutputs, self.finalOutputVal, numTimestepsPremise)
        self.finalOutputVal = hstar

    # Apply word-by-word attention
    if wordwiseAttention:
        hstar = self.applyWordwiseAttention(premiseOutputs, timestepOut[0], self.finalOutputVal,
                                            batchSize, numTimestepsPremise, numTimestepsHypothesis)
        self.finalOutputVal = hstar

    # Apply dropout to finalOutputVal before projecting to categories
    self.finalOutputVal = self.applyDropout(self.finalOutputVal, self.dropoutMode, dropoutRate)

    catOutput = self.projectToCategories()
    cost = self.computeCrossEntropyCost(catOutput, yTarget)

    # Get params specific to cell and add L2 regularization to cost
    LSTMparams = [self.params[cParam] for cParam in self.LSTMcellParams]
    cost = cost + computeParamNorms(LSTMparams, L2regularization)

    return cost, theano.function([inputPremise, inputHypothesis, yTarget], cost,
                                 name="LSTM_cost_function", on_unused_input="warn")
def testRegularization():
    layer = HiddenLayer(2, 2, 2, "test", numCategories=3)
    premise = T.ftensor3("testP")
    hypothesis = T.ftensor3("testH")
    yTarget = T.fmatrix("testyTarget")

    hyp = np.array([[[0.5, 0.6]], [[0.3, 0.8]]], dtype=np.float32)
    prem = np.array([[[0.5, 0.6]], [[0.3, 0.8]]], dtype=np.float32)
    yTargetNP = np.array([[0., 1., 0.]], dtype=np.float32)

    layer.printLayerParams()
    # premiseOutputs=None and batchSize=1 fill the required positional args; both are unused
    # because attention is disabled here
    cost, fn = layer.costFunc(premise, hypothesis, yTarget, "hypothesis", 0.0, 1, None, 1)
    costValue = fn(prem, hyp, yTargetNP)
    print "Cost: ", costValue

    LSTMparams = [layer.params[cParam] for cParam in layer.LSTMcellParams]
    print "L2 norm all params: ", computeParamNorms(layer.params.values(), 0.5).eval()
    print "L2 norm LSTM cell params: ", computeParamNorms(LSTMparams, 0.5).eval()
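
# --- Sketch of the computeParamNorms helper (assumption) ---------------------------------
# computeParamNorms is called throughout this file but not defined here. Based on how it is
# used -- it takes a list of Theano shared parameters plus a regularization coefficient and
# yields a symbolic penalty that can be added to the cost or .eval()'d directly -- it
# presumably computes an L2 penalty along these lines (possibly with an extra factor of 1/2):
def computeParamNormsSketch(params, L2regularization):
    return L2regularization * sum((param ** 2).sum() for param in params)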