# Variant 1: linear predictor over a stacked input window, scored against
# correct and noise representations. Assumes these names are defined at
# module level: theano, t (tensor helpers providing xmatrix/xvector), stack,
# score, dot, output_weights, output_biases, COMPILE_MODE, and
# cached_functions (a dict).
def functions(sequence_length):
    """
    Return two functions:
     * The first function does prediction.
     * The second function does learning.
    """
    global cached_functions
    cachekey = (sequence_length,)
    if len(cached_functions.keys()) > 1:
        # This is problematic because we use global variables for the model
        # parameters, so we might be unsafe if we are using the wrong model
        # parameters globally.
        assert 0
    if cachekey not in cached_functions:
        print "Need to construct graph for sequence_length=%d..." % sequence_length
        # Create the sequence_length inputs.
        # Each is a t.xmatrix(): initial word embeddings (provided by
        # Jason + Ronan) to be transformed into an initial representation.
        # We could use a vector, but instead we use a matrix with one row.
        sequence = [t.xmatrix() for i in range(sequence_length)]
        correct_repr = t.xmatrix()
        noise_repr = t.xmatrix()
        correct_scorebias = t.xvector()
        noise_scorebias = t.xvector()

        stackedsequence = stack(sequence)
        predictrepr = dot(stackedsequence, output_weights) + output_biases
        correct_score = score(correct_repr, predictrepr) + correct_scorebias
        noise_score = score(noise_repr, predictrepr) + noise_scorebias
        # One-sided hinge (margin ranking) loss; 1e999 overflows to +inf,
        # so this computes max(0, 1 - correct_score + noise_score).
        loss = t.clip(1 - correct_score + noise_score, 0, 1e999)

        (doutput_weights, doutput_biases) = t.grad(loss, [output_weights, output_biases])
        dsequence = t.grad(loss, sequence)
        (dcorrect_repr, dnoise_repr) = t.grad(loss, [correct_repr, noise_repr])
        (dcorrect_scorebias, dnoise_scorebias) = t.grad(loss, [correct_scorebias, noise_scorebias])

        predict_inputs = sequence + [correct_repr, correct_scorebias,
                                     output_weights, output_biases]
        train_inputs = sequence + [correct_repr, noise_repr,
                                   correct_scorebias, noise_scorebias,
                                   output_weights, output_biases]
        predict_outputs = [predictrepr, correct_score]
        train_outputs = [loss, predictrepr, correct_score, noise_score] \
                        + dsequence \
                        + [dcorrect_repr, dnoise_repr,
                           doutput_weights, doutput_biases,
                           dcorrect_scorebias, dnoise_scorebias]

        import theano.gof.graph
        nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        print "About to compile predict function over %d ops [nodes]..." % nnodes
        predict_function = theano.function(predict_inputs, predict_outputs, mode=COMPILE_MODE)
        print "...done constructing graph for sequence_length=%d" % sequence_length

        nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
        print "About to compile train function over %d ops [nodes]..." % nnodes
        train_function = theano.function(train_inputs, train_outputs, mode=COMPILE_MODE)
        print "...done constructing graph for sequence_length=%d" % sequence_length

        cached_functions[cachekey] = (predict_function, train_function)
    return cached_functions[cachekey]
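# --- Usage sketch for Variant 1 (illustrative, not from the original source).
# A minimal sketch of how this variant might be called, assuming the global
# Theano variables (output_weights, output_biases) are bound to numpy arrays
# W and b at call time, and that every representation is a (1, d) row matrix.
# EMBEDDING_SIZE, W, and b are hypothetical names introduced here. Compiled
# Theano functions take their arguments in the order of the inputs list, so:
#
#   predict_function, train_function = functions(sequence_length=2)
#   e1 = numpy.zeros((1, EMBEDDING_SIZE), dtype=floatX)
#   e2 = numpy.zeros((1, EMBEDDING_SIZE), dtype=floatX)
#   correct = numpy.zeros((1, EMBEDDING_SIZE), dtype=floatX)
#   scorebias = numpy.zeros(1, dtype=floatX)
#   predictrepr, correct_score = predict_function(e1, e2, correct,
#                                                 scorebias, W, b)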
# Variant 2: C&W-style ranking model with a hidden layer, an optional L1
# penalty on the embeddings, and SGD updates applied to the model parameters.
# Assumes module-level: t, pfunc, stack, score, numpy, floatX, COMPILE_MODE,
# cached_functions, and the shared variables hidden_weights, hidden_biases,
# output_weights, output_biases.
def functions(sequence_length):
    """
    Return three functions:
     * The first function does prediction.
     * The second function does learning.
     * The third function does verbose prediction (it also returns the
       pre-hidden representation).
    """
    global cached_functions
    cachekey = (sequence_length,)
    if len(cached_functions.keys()) > 1:
        # This is problematic because we use global variables for the model
        # parameters, so we might be unsafe if we are using the wrong model
        # parameters globally.
        assert 0
    if cachekey not in cached_functions:
        print "Need to construct graph for sequence_length=%d..." % sequence_length
        # Create the sequence_length inputs.
        # Each is a t.xmatrix(): initial word embeddings (provided by
        # Jason + Ronan) to be transformed into an initial representation.
        # We could use a vector, but instead we use a matrix with one row.
        correct_inputs = [t.xmatrix() for i in range(sequence_length)]
        noise_inputs = [t.xmatrix() for i in range(sequence_length)]
        learning_rate = t.xscalar()

        stacked_correct_inputs = stack(correct_inputs)
        stacked_noise_inputs = stack(noise_inputs)
        correct_score, correct_prehidden = score(stacked_correct_inputs)
        noise_score, noise_prehidden = score(stacked_noise_inputs)
        # One-sided hinge (margin ranking) loss; 1e999 overflows to +inf.
        unpenalized_loss = t.clip(1 - correct_score + noise_score, 0, 1e999)

        from hyperparameters import HYPERPARAMETERS
        if HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"] != 0:
            l1penalty = t.sum(t.abs_(stacked_correct_inputs)
                              + t.abs_(stacked_noise_inputs), axis=1).T \
                        * HYPERPARAMETERS["CW_EMBEDDING_L1_PENALTY"]
        else:
            l1penalty = t.as_tensor_variable(numpy.asarray(0, dtype=floatX))
        loss = (unpenalized_loss.T + l1penalty).T

        total_loss = t.sum(loss)
        (dhidden_weights, dhidden_biases, doutput_weights, doutput_biases) = \
            t.grad(total_loss, [hidden_weights, hidden_biases,
                                output_weights, output_biases])
        dcorrect_inputs = t.grad(total_loss, correct_inputs)
        dnoise_inputs = t.grad(total_loss, noise_inputs)

        predict_inputs = correct_inputs
        train_inputs = correct_inputs + noise_inputs + [learning_rate]
        verbose_predict_inputs = predict_inputs
        predict_outputs = [correct_score]
        train_outputs = dcorrect_inputs + dnoise_inputs \
                        + [loss, unpenalized_loss, l1penalty,
                           correct_score, noise_score]
        verbose_predict_outputs = [correct_score, correct_prehidden]

        import theano.gof.graph
        nnodes = len(theano.gof.graph.ops(predict_inputs, predict_outputs))
        print "About to compile predict function over %d ops [nodes]..." % nnodes
        predict_function = pfunc(predict_inputs, predict_outputs, mode=COMPILE_MODE)
        print "...done constructing graph for sequence_length=%d" % sequence_length

        nnodes = len(theano.gof.graph.ops(verbose_predict_inputs, verbose_predict_outputs))
        print "About to compile verbose predict function over %d ops [nodes]..." % nnodes
        verbose_predict_function = pfunc(verbose_predict_inputs,
                                         verbose_predict_outputs,
                                         mode=COMPILE_MODE)
        print "...done constructing graph for sequence_length=%d" % sequence_length

        nnodes = len(theano.gof.graph.ops(train_inputs, train_outputs))
        print "About to compile train function over %d ops [nodes]..." % nnodes
        # One SGD step per call: p <- p - learning_rate * dp for each of the
        # model parameters, applied in-place via Theano updates.
        train_function = pfunc(
            train_inputs, train_outputs, mode=COMPILE_MODE,
            updates=[(p, p - learning_rate * gp)
                     for p, gp in zip((hidden_weights, hidden_biases,
                                       output_weights, output_biases),
                                      (dhidden_weights, dhidden_biases,
                                       doutput_weights, doutput_biases))])
        print "...done constructing graph for sequence_length=%d" % sequence_length

        cached_functions[cachekey] = (predict_function, train_function,
                                      verbose_predict_function)
    return cached_functions[cachekey]
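# --- Usage sketch for Variant 2 (illustrative, not from the original source).
# Assuming the parameter globals are Theano shared variables (required by the
# updates list above), train_function updates them in-place, while the
# embedding gradients are returned to the caller. With sequence_length=2 the
# outputs unpack as below; all input names are hypothetical:
#
#   predict_function, train_function, verbose_predict_function = functions(2)
#   outputs = train_function(correct_e1, correct_e2, noise_e1, noise_e2, lr)
#   dcorrect_inputs = outputs[:2]   # gradients w.r.t. the correct window
#   dnoise_inputs = outputs[2:4]    # gradients w.r.t. the noise window
#   # The caller applies dcorrect_inputs/dnoise_inputs to the embedding table
#   # itself, since embeddings are fed as inputs, not as shared parameters.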