def construct_lmcost(self, hidden_layer, hidden_layer_size, word_id_targets,
                     lmcost_layer_size, max_vocab_size, name):
    lmcost_layer = recurrence.create_feedforward(
        hidden_layer, hidden_layer_size, lmcost_layer_size, "tanh",
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name=name + "_tanh")
    lmcost_output = recurrence.create_feedforward(
        lmcost_layer, lmcost_layer_size, max_vocab_size + 1, "linear",
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name=name + "_output")
    lmcost_output = theano.tensor.nnet.softmax(
        lmcost_output.reshape(
            (lmcost_layer.shape[0] * lmcost_layer.shape[1],
             max_vocab_size + 1)))
    word_id_targets = theano.tensor.switch(
        theano.tensor.ge(word_id_targets, max_vocab_size),
        max_vocab_size, word_id_targets)
    lmcost = theano.tensor.nnet.categorical_crossentropy(
        lmcost_output, word_id_targets.reshape((-1, ))).sum()
    return lmcost
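
# For reference only: `recurrence.create_feedforward` is one of this project's own
# helpers and its implementation is not part of this excerpt. Judging from how it is
# called above, it plausibly builds a single dense layer; the sketch below is an
# assumption (including the "_W"/"_b" parameter names), not the actual implementation.
#
#   def create_feedforward(input_tensor, input_size, output_size, activation,
#                          fn_create_parameter_matrix, name):
#       W = fn_create_parameter_matrix(name + "_W", (input_size, output_size))
#       b = fn_create_parameter_matrix(name + "_b", (output_size, ))
#       output = theano.tensor.dot(input_tensor, W) + b
#       if activation == "tanh":
#           output = theano.tensor.tanh(output)
#       elif activation == "sigmoid":
#           output = theano.tensor.nnet.sigmoid(output)
#       return output  # "linear" keeps the pre-activation unchanged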
def __init__(self, config):
    self.config = config
    self.params = collections.OrderedDict()
    self.rng = numpy.random.RandomState(config["random_seed"])

    # Symbolic inputs: padded word/char id matrices, the character mask and the
    # gold label ids for each sentence in the batch.
    word_ids = theano.tensor.imatrix('word_ids')
    char_ids = theano.tensor.itensor3('char_ids')
    char_mask = theano.tensor.ftensor3('char_mask')
    label_ids = theano.tensor.imatrix('label_ids')
    learningrate = theano.tensor.fscalar('learningrate')

    cost = 0.0
    input_tensor = None
    input_vector_size = 0

    # Word-level embeddings.
    self.word_embeddings = self.create_parameter_matrix(
        'word_embeddings',
        (config["n_words"], config["word_embedding_size"]))
    input_tensor = self.word_embeddings[word_ids]
    input_vector_size = config["word_embedding_size"]

    # Character-level component: embed characters, run a bidirectional RNN over
    # each word and project the result to the word embedding size.
    char_embeddings = self.create_parameter_matrix(
        'char_embeddings',
        (config["n_chars"], config["char_embedding_size"]))
    char_input_tensor = char_embeddings[char_ids].reshape(
        (char_ids.shape[0] * char_ids.shape[1], char_ids.shape[2],
         config["char_embedding_size"]))
    char_mask_reshaped = char_mask.reshape(
        (char_ids.shape[0] * char_ids.shape[1], char_ids.shape[2]))
    char_output_tensor = recurrence.create_birnn(
        char_input_tensor, config["char_embedding_size"],
        char_mask_reshaped, config["char_recurrent_size"],
        return_combined=True,
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="char_birnn")
    char_output_tensor = recurrence.create_feedforward(
        char_output_tensor, config["char_recurrent_size"] * 2,
        config["word_embedding_size"], "tanh",
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="char_ff")
    char_output_tensor = char_output_tensor.reshape(
        (char_ids.shape[0], char_ids.shape[1],
         config["word_embedding_size"]))

    if config["char_integration_method"] == "input":
        # Concatenate the character-based representation to the word embedding.
        input_tensor = theano.tensor.concatenate(
            [input_tensor, char_output_tensor], axis=2)
        input_vector_size += config["word_embedding_size"]
    elif config["char_integration_method"] == "attention":
        # Pull the character-based representation towards the (gradient-blocked)
        # word embedding via a cosine cost for known words, then combine the two
        # with a learned gating vector.
        static_input_tensor = theano.gradient.disconnected_grad(input_tensor)
        is_unk = theano.tensor.eq(word_ids, config["unk_token_id"])
        is_unk_tensor = is_unk.dimshuffle(0, 1, 'x')
        char_output_tensor_normalised = \
            char_output_tensor / char_output_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
        static_input_tensor_normalised = \
            static_input_tensor / static_input_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
        cosine_cost = 1.0 - (char_output_tensor_normalised *
                             static_input_tensor_normalised).sum(axis=2)
        cost += theano.tensor.switch(is_unk, 0.0, cosine_cost).sum()
        attention_evidence_tensor = theano.tensor.concatenate(
            [input_tensor, char_output_tensor], axis=2)
        attention_output = recurrence.create_feedforward(
            attention_evidence_tensor, config["word_embedding_size"] * 2,
            config["word_embedding_size"], "tanh",
            self.create_parameter_matrix, "attention_tanh")
        attention_output = recurrence.create_feedforward(
            attention_output, config["word_embedding_size"],
            config["word_embedding_size"], "sigmoid",
            self.create_parameter_matrix, "attention_sigmoid")
        input_tensor = input_tensor * attention_output + \
            char_output_tensor * (1.0 - attention_output)

    # Word-level bidirectional RNN over the combined representation, followed by a
    # narrow hidden layer and the output projection.
    processed_tensor = recurrence.create_birnn(
        input_tensor, input_vector_size, None,
        config["word_recurrent_size"], return_combined=False,
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="word_birnn")
    processed_tensor = recurrence.create_feedforward(
        processed_tensor, config["word_recurrent_size"] * 2,
        config["narrow_layer_size"], "tanh",
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="narrow_ff")

    W_output = self.create_parameter_matrix(
        'W_output', (config["narrow_layer_size"], config["n_labels"]))
    bias_output = self.create_parameter_matrix(
        'bias_output', (config["n_labels"], ))
    output = theano.tensor.dot(processed_tensor, W_output) + bias_output
    output = output[:, 1:-1, :]  # removing <s> and </s>

    if config["crf_on_top"]:
        # CRF output layer: maximise the score of the gold label sequence
        # relative to the sum over all possible label paths.
        all_paths_scores, real_paths_scores, best_sequence, scores = crf.construct(
            "crf", output, config["n_labels"], label_ids,
            self.create_parameter_matrix)
        predicted_labels = best_sequence
        output_probs = scores
        cost += -(real_paths_scores - all_paths_scores).sum()
    else:
        # Softmax output layer with token-level cross-entropy.
        output_probs = theano.tensor.nnet.softmax(
            output.reshape((word_ids.shape[0] * (word_ids.shape[1] - 2),
                            config["n_labels"])))
        predicted_labels = theano.tensor.argmax(
            output_probs.reshape((word_ids.shape[0],
                                  (word_ids.shape[1] - 2),
                                  config["n_labels"])),
            axis=2)
        cost += theano.tensor.nnet.categorical_crossentropy(
            output_probs, label_ids.reshape((-1, ))).sum()

    # Gradients and AdaDelta updates over all model parameters.
    gradients = theano.tensor.grad(cost, list(self.params.values()),
                                   disconnected_inputs='ignore')
    updates = lasagne.updates.adadelta(gradients, list(self.params.values()),
                                       learningrate)

    input_vars_train = [word_ids, char_ids, char_mask, label_ids, learningrate]
    input_vars_test = [word_ids, char_ids, char_mask, label_ids]
    output_vars = [cost, predicted_labels]
    self.train = theano.function(input_vars_train, output_vars,
                                 updates=updates,
                                 on_unused_input='ignore',
                                 allow_input_downcast=True)
    self.test = theano.function(input_vars_test, output_vars,
                                on_unused_input='ignore',
                                allow_input_downcast=True)
    self.predict = theano.function([word_ids, char_ids, char_mask],
                                   predicted_labels,
                                   on_unused_input='ignore',
                                   allow_input_downcast=True)
def __init__(self, config):
    self.config = config
    self.params = collections.OrderedDict()
    self.rng = numpy.random.RandomState(config["random_seed"])

    word_ids = theano.tensor.imatrix('word_ids')
    char_ids = theano.tensor.itensor3('char_ids')
    char_mask = theano.tensor.ftensor3('char_mask')
    label_ids = theano.tensor.imatrix('label_ids')
    learningrate = theano.tensor.fscalar('learningrate')
    is_training = theano.tensor.iscalar('is_training')

    cost = 0.0
    input_tensor = None
    input_vector_size = 0

    self.word_embeddings = self.create_parameter_matrix(
        'word_embeddings',
        (config["n_words"], config["word_embedding_size"]))
    input_tensor = self.word_embeddings[word_ids]
    input_vector_size = config["word_embedding_size"]

    char_embeddings = self.create_parameter_matrix(
        'char_embeddings',
        (config["n_chars"], config["char_embedding_size"]))
    char_input_tensor = char_embeddings[char_ids].reshape(
        (char_ids.shape[0] * char_ids.shape[1], char_ids.shape[2],
         config["char_embedding_size"]))
    char_mask_reshaped = char_mask.reshape(
        (char_ids.shape[0] * char_ids.shape[1], char_ids.shape[2]))
    char_output_tensor = recurrence.create_birnn(
        char_input_tensor, config["char_embedding_size"],
        char_mask_reshaped, config["char_recurrent_size"],
        return_combined=True,
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="char_birnn")
    char_output_tensor = recurrence.create_feedforward(
        char_output_tensor, config["char_recurrent_size"] * 2,
        config["word_embedding_size"], "tanh",
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="char_ff")
    char_output_tensor = char_output_tensor.reshape(
        (char_ids.shape[0], char_ids.shape[1],
         config["word_embedding_size"]))

    if config["char_integration_method"] == "input":
        input_tensor = theano.tensor.concatenate(
            [input_tensor, char_output_tensor], axis=2)
        input_vector_size += config["word_embedding_size"]
    elif config["char_integration_method"] == "attention":
        static_input_tensor = theano.gradient.disconnected_grad(input_tensor)
        is_unk = theano.tensor.eq(word_ids, config["unk_token_id"])
        is_unk_tensor = is_unk.dimshuffle(0, 1, 'x')
        char_output_tensor_normalised = \
            char_output_tensor / char_output_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
        static_input_tensor_normalised = \
            static_input_tensor / static_input_tensor.norm(2, axis=2)[:, :, numpy.newaxis]
        cosine_cost = 1.0 - (char_output_tensor_normalised *
                             static_input_tensor_normalised).sum(axis=2)
        cost += theano.tensor.switch(is_unk, 0.0, cosine_cost).sum()
        attention_evidence_tensor = theano.tensor.concatenate(
            [input_tensor, char_output_tensor], axis=2)
        attention_output = recurrence.create_feedforward(
            attention_evidence_tensor, config["word_embedding_size"] * 2,
            config["word_embedding_size"], "tanh",
            self.create_parameter_matrix, "attention_tanh")
        attention_output = recurrence.create_feedforward(
            attention_output, config["word_embedding_size"],
            config["word_embedding_size"], "sigmoid",
            self.create_parameter_matrix, "attention_sigmoid")
        input_tensor = input_tensor * attention_output + \
            char_output_tensor * (1.0 - attention_output)

    if config["dropout_input"] > 0.0:
        # Dropout on the combined input: a binary mask at training time and the
        # expected scaling (1 - p) at test time, selected through is_training.
        p = config["dropout_input"]
        trng = theano.tensor.shared_randomstreams.RandomStreams(seed=1)
        dropout_mask = trng.binomial(n=1, p=1 - p, size=input_tensor.shape,
                                     dtype=theano.config.floatX)
        input_train = dropout_mask * input_tensor
        input_test = (1 - p) * input_tensor
        input_tensor = theano.tensor.switch(
            theano.tensor.neq(is_training, 0), input_train, input_test)

    # Separate forward and backward LSTMs over the sentence; keeping them separate
    # makes their hidden states available for the LM objective below.
    recurrent_forward = recurrence.create_lstm(
        input_tensor.dimshuffle(1, 0, 2), input_vector_size, None,
        config["word_recurrent_size"], False, go_backwards=False,
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="word_birnn_forward").dimshuffle(1, 0, 2)
    recurrent_backward = recurrence.create_lstm(
        input_tensor.dimshuffle(1, 0, 2), input_vector_size, None,
        config["word_recurrent_size"], False, go_backwards=True,
        fn_create_parameter_matrix=self.create_parameter_matrix,
        name="word_birnn_backward").dimshuffle(1, 0, 2)

    if config["lmcost_gamma"] > 0.0:
        # Auxiliary language-modelling objective: the forward states predict the
        # next word, the backward states predict the previous word.
        lmcost_max_vocab_size = min(self.config["n_words"],
                                    self.config["lmcost_max_vocab_size"])
        cost += config["lmcost_gamma"] * self.construct_lmcost(
            recurrent_forward[:, :-1, :], config["word_recurrent_size"],
            word_ids[:, 1:], self.config["lmcost_layer_size"],
            lmcost_max_vocab_size, "lmcost_forward")
        cost += config["lmcost_gamma"] * self.construct_lmcost(
            recurrent_backward[:, 1:, :], config["word_recurrent_size"],
            word_ids[:, :-1], self.config["lmcost_layer_size"],
            lmcost_max_vocab_size, "lmcost_backward")

    processed_tensor = theano.tensor.concatenate(
        [recurrent_forward, recurrent_backward], axis=2)
    processed_tensor_size = config["word_recurrent_size"] * 2

    if config["narrow_layer_size"] > 0:
        processed_tensor = recurrence.create_feedforward(
            processed_tensor, processed_tensor_size,
            config["narrow_layer_size"], "tanh",
            fn_create_parameter_matrix=self.create_parameter_matrix,
            name="narrow_ff")
        processed_tensor_size = config["narrow_layer_size"]

    W_output = self.create_parameter_matrix(
        'W_output', (processed_tensor_size, config["n_labels"]))
    bias_output = self.create_parameter_matrix(
        'bias_output', (config["n_labels"], ))
    output = theano.tensor.dot(processed_tensor, W_output) + bias_output
    output = output[:, 1:-1, :]  # removing <s> and </s>

    if config["crf_on_top"]:
        all_paths_scores, real_paths_scores, best_sequence, scores = crf.construct(
            "crf", output, config["n_labels"], label_ids,
            self.create_parameter_matrix)
        predicted_labels = best_sequence
        output_probs = scores
        cost += -(real_paths_scores - all_paths_scores).sum()
    else:
        output_probs_ = theano.tensor.nnet.softmax(
            output.reshape((word_ids.shape[0] * (word_ids.shape[1] - 2),
                            config["n_labels"])))
        output_probs = output_probs_.reshape(
            (word_ids.shape[0], (word_ids.shape[1] - 2),
             config["n_labels"]))
        predicted_labels = theano.tensor.argmax(output_probs, axis=2)
        cost += theano.tensor.nnet.categorical_crossentropy(
            output_probs_, label_ids.reshape((-1, ))).sum()

    gradients = theano.tensor.grad(cost, list(self.params.values()),
                                   disconnected_inputs='ignore')
    if config["opt_strategy"] == "adadelta":
        updates = lasagne.updates.adadelta(gradients, list(self.params.values()),
                                           learningrate)
    elif config["opt_strategy"] == "adam":
        updates = lasagne.updates.adam(gradients, list(self.params.values()),
                                       learningrate)
    elif config["opt_strategy"] == "sgd":
        updates = lasagne.updates.sgd(gradients, list(self.params.values()),
                                      learningrate)
    else:
        raise ValueError("Unknown optimisation strategy: " +
                         str(config["opt_strategy"]))

    input_vars_train = [word_ids, char_ids, char_mask, label_ids, learningrate]
    input_vars_test = [word_ids, char_ids, char_mask, label_ids]
    output_vars = [cost, predicted_labels]
    self.train = theano.function(input_vars_train, output_vars,
                                 updates=updates,
                                 on_unused_input='ignore',
                                 allow_input_downcast=True,
                                 givens={is_training: numpy.cast['int32'](1)})
    self.test = theano.function(input_vars_test, output_vars,
                                on_unused_input='ignore',
                                allow_input_downcast=True,
                                givens={is_training: numpy.cast['int32'](0)})
    self.test_return_probs = theano.function(input_vars_test,
                                             output_vars + [output_probs],
                                             on_unused_input='ignore',
                                             allow_input_downcast=True,
                                             givens={is_training: numpy.cast['int32'](0)})
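
# Usage sketch (illustrative only): the config keys below are the ones read in
# __init__ above, but every value is made up, and the surrounding class is not part
# of this excerpt, so the name `SequenceLabelingModel` is hypothetical. Inputs are
# padded numpy id matrices; word_ids is expected to include the <s> and </s> markers
# that the output layer strips off.
#
#   config = {"random_seed": 42, "n_words": 10000, "n_chars": 100, "n_labels": 5,
#             "word_embedding_size": 300, "char_embedding_size": 50,
#             "char_recurrent_size": 100, "word_recurrent_size": 200,
#             "char_integration_method": "attention", "unk_token_id": 1,
#             "dropout_input": 0.5, "narrow_layer_size": 50, "crf_on_top": True,
#             "lmcost_gamma": 0.1, "lmcost_layer_size": 50,
#             "lmcost_max_vocab_size": 7500, "opt_strategy": "adadelta"}
#   model = SequenceLabelingModel(config)
#   cost, predicted_labels = model.train(word_ids, char_ids, char_mask,
#                                        label_ids, 1.0)
#   cost, predicted_labels = model.test(word_ids, char_ids, char_mask, label_ids)
#   cost, predicted_labels, probs = model.test_return_probs(word_ids, char_ids,
#                                                           char_mask, label_ids)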