def test_forward_works_with_subtract_combinations(self):
    linear = LinearSimilarity(2, 2, combination='x-y')
    linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5]))
    linear._bias = Parameter(torch.FloatTensor([0]))
    a_vectors = torch.FloatTensor([[1, 1], [-1, -1]])
    b_vectors = torch.FloatTensor([[1, 0], [0, 1]])
    result = linear(a_vectors, b_vectors).data.numpy()
    assert result.shape == (2,)
    assert_almost_equal(result, [.5, -.7])
def test_forward_does_a_weighted_product(self):
    linear = LinearSimilarity(3, 1, combination='x,y')
    linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5, 2.0, -1.0]))
    linear._bias = Parameter(torch.FloatTensor([.1]))
    a_vectors = torch.FloatTensor([[[1, 1, 1], [-1, -1, 0]]])
    b_vectors = torch.FloatTensor([[[0], [1]]])
    result = linear(a_vectors, b_vectors).data.numpy()
    assert result.shape == (1, 2)
    assert_almost_equal(result, [[2.3, -1.1]])
def test_forward_works_with_divide_combinations(self):
    linear = LinearSimilarity(2, 2, combination='x/y')
    linear._weight_vector = Parameter(torch.FloatTensor([-.3, .5]))
    linear._bias = Parameter(torch.FloatTensor([0]))
    a_vectors = torch.FloatTensor([[1, 1], [-1, -1]])
    b_vectors = torch.FloatTensor([[1, 2], [2, 1]])
    result = linear(a_vectors, b_vectors).data.numpy()
    assert result.shape == (2,)
    assert_almost_equal(result, [-.05, -.35])
def test_forward_works_with_higher_order_tensors(self):
    linear = LinearSimilarity(7, 7, combination='x,y')
    weights = numpy.random.rand(14)
    linear._weight_vector = Parameter(torch.from_numpy(weights).float())
    linear._bias = Parameter(torch.FloatTensor([0.]))
    a_vectors = numpy.random.rand(5, 4, 3, 6, 7)
    b_vectors = numpy.random.rand(5, 4, 3, 6, 7)
    result = linear(torch.from_numpy(a_vectors).float(),
                    torch.from_numpy(b_vectors).float())
    result = result.data.numpy()
    assert result.shape == (5, 4, 3, 6)
    combined_vectors = numpy.concatenate([a_vectors[3, 2, 1, 3, :],
                                          b_vectors[3, 2, 1, 3, :]])
    expected_result = numpy.dot(combined_vectors, weights)
    assert_almost_equal(result[3, 2, 1, 3], expected_result, decimal=6)
def test_can_construct_from_params(self):
    params = Params({'tensor_1_dim': 4,
                     'tensor_2_dim': 4,
                     'combination': 'x,y,x*y,y-x'})
    linear = LinearSimilarity.from_params(params)
    assert list(linear._weight_vector.size()) == [16]
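# For reference, the similarity these tests exercise reduces to a dot product
# against a combined vector: each comma-separated piece of the combination
# string ('x', 'y', or an elementwise op such as 'x*y', 'x-y', 'x/y') is
# concatenated on the last dimension, then dotted with the weight vector and
# shifted by the bias. A minimal sketch, assuming AllenNLP-style combination
# semantics (the real LinearSimilarity also supports an optional activation):
import torch

def linear_similarity_sketch(x, y, weight, bias, combination='x,y'):
    pieces = {'x': x, 'y': y, 'x*y': x * y, 'x-y': x - y,
              'x/y': x / y, 'y-x': y - x}
    combined = torch.cat([pieces[p] for p in combination.split(',')], dim=-1)
    # matmul reduces the last dimension, so arbitrary leading (batch) dims
    # broadcast through -- which is what the higher-order tensor test checks.
    return torch.matmul(combined, weight) + bias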
def __init__(self, vocab: Vocabulary, cf_a, preloaded_elmo=None) -> None:
    super(BidirectionalAttentionFlow_1, self).__init__(vocab, cf_a.regularizer)

    """
    Initialize some data structures
    """
    self.cf_a = cf_a
    # Bayesian data models
    self.VBmodels = []
    self.LinearModels = []

    """
    ############## TEXT FIELD EMBEDDER with ELMO ####################
    text_field_embedder : ``TextFieldEmbedder``
        Used to embed the ``question`` and ``passage`` ``TextFields`` we get as input to
        the model.
    """
    if cf_a.use_ELMO:
        if preloaded_elmo is not None:
            text_field_embedder = preloaded_elmo
        else:
            text_field_embedder = bidut.download_Elmo(cf_a.ELMO_num_layers, cf_a.ELMO_droput)
            print("ELMO loaded from disk or downloaded")
    else:
        text_field_embedder = None

    # embedder_out_dim = text_field_embedder.get_output_dim()
    self._text_field_embedder = text_field_embedder

    if cf_a.Add_Linear_projection_ELMO:
        if self.cf_a.VB_Linear_projection_ELMO:
            prior = Vil.Prior(**(cf_a.VB_Linear_projection_ELMO_prior))
            print("----------------- Bayesian Linear Projection ELMO --------------")
            linear_projection_ELMO = LinearVB(text_field_embedder.get_output_dim(), 200,
                                              prior=prior)
            self.VBmodels.append(linear_projection_ELMO)
        else:
            linear_projection_ELMO = torch.nn.Linear(text_field_embedder.get_output_dim(), 200)
        self._linear_projection_ELMO = linear_projection_ELMO

    """
    ############## Highway layers ####################
    num_highway_layers : ``int``
        The number of highway layers to use in between embedding the input and passing it
        through the phrase layer.
    """
    if cf_a.Add_Linear_projection_ELMO:
        Input_dimension_highway = 200
    else:
        Input_dimension_highway = text_field_embedder.get_output_dim()
    num_highway_layers = cf_a.num_highway_layers

    if self.cf_a.VB_highway_layers:
        print("----------------- Bayesian Highway network --------------")
        prior = Vil.Prior(**(cf_a.VB_highway_layers_prior))
        highway_layer = HighwayVB(Input_dimension_highway, num_highway_layers, prior=prior)
        self.VBmodels.append(highway_layer)
    else:
        highway_layer = Highway(Input_dimension_highway, num_highway_layers)
    highway_layer = TimeDistributed(highway_layer)
    self._highway_layer = highway_layer

    """
    ############## Phrase layer ####################
    phrase_layer : ``Seq2SeqEncoder``
        The encoder (with its own internal stacking) that we will use in between embedding
        tokens and doing the bidirectional attention.
    """
    if cf_a.phrase_layer_dropout > 0:
        # Create dropout layer (PyTorch LSTMs do not apply dropout after the last
        # layer, so we apply it ourselves)
        dropout_phrase_layer = torch.nn.Dropout(p=cf_a.phrase_layer_dropout)
    else:
        dropout_phrase_layer = lambda x: x

    phrase_layer = PytorchSeq2SeqWrapper(
        torch.nn.LSTM(Input_dimension_highway,
                      hidden_size=cf_a.phrase_layer_hidden_size,
                      batch_first=True,
                      bidirectional=True,
                      num_layers=cf_a.phrase_layer_num_layers,
                      dropout=cf_a.phrase_layer_dropout))
    phrase_encoding_out_dim = cf_a.phrase_layer_hidden_size * 2
    self._phrase_layer = phrase_layer
    self._dropout_phrase_layer = dropout_phrase_layer

    """
    ############## Matrix attention layer ####################
    similarity_function : ``SimilarityFunction``
        The similarity function that we will use when comparing encoded passage and
        question representations.
""" # Linear later to compute the start if (self.cf_a.VB_similarity_function): prior = Vil.Prior(**(cf_a.VB_similarity_function_prior)) print( "----------------- Bayesian Similarity matrix --------------") similarity_function = LinearSimilarityVB( combination="x,y,x*y", tensor_1_dim=phrase_encoding_out_dim, tensor_2_dim=phrase_encoding_out_dim, prior=prior) self.VBmodels.append(similarity_function) else: similarity_function = LinearSimilarity( combination="x,y,x*y", tensor_1_dim=phrase_encoding_out_dim, tensor_2_dim=phrase_encoding_out_dim) matrix_attention = LegacyMatrixAttention(similarity_function) self._matrix_attention = matrix_attention """ ############## Modelling Layer #################### modeling_layer : ``Seq2SeqEncoder`` The encoder (with its own internal stacking) that we will use in between the bidirectional attention and predicting span start and end. """ ## Create dropout layer if cf_a.modeling_passage_dropout > 0: ## Create dropout layer dropout_modeling_passage = torch.nn.Dropout( p=cf_a.modeling_passage_dropout) else: dropout_modeling_passage = lambda x: x modeling_layer = PytorchSeq2SeqWrapper( torch.nn.LSTM(phrase_encoding_out_dim * 4, hidden_size=cf_a.modeling_passage_hidden_size, batch_first=True, bidirectional=True, num_layers=cf_a.modeling_passage_num_layers, dropout=cf_a.modeling_passage_dropout)) self._modeling_layer = modeling_layer self._dropout_modeling_passage = dropout_modeling_passage """ ############## Span Start Representation ##################### span_end_encoder : ``Seq2SeqEncoder`` The encoder that we will use to incorporate span start predictions into the passage state before predicting span end. """ encoding_dim = phrase_layer.get_output_dim() modeling_dim = modeling_layer.get_output_dim() span_start_input_dim = encoding_dim * 4 + modeling_dim # Linear later to compute the start if (self.cf_a.VB_span_start_predictor_linear): prior = Vil.Prior(**(cf_a.VB_span_start_predictor_linear_prior)) print( "----------------- Bayesian Span Start Predictor--------------" ) span_start_predictor_linear = LinearVB(span_start_input_dim, 1, prior=prior) self.VBmodels.append(span_start_predictor_linear) else: span_start_predictor_linear = torch.nn.Linear( span_start_input_dim, 1) self._span_start_predictor_linear = span_start_predictor_linear self._span_start_predictor = TimeDistributed( span_start_predictor_linear) """ ############## Span End Representation ##################### """ ## Create dropout layer if cf_a.span_end_encoder_dropout > 0: dropout_span_end_encode = torch.nn.Dropout( p=cf_a.span_end_encoder_dropout) else: dropout_span_end_encode = lambda x: x span_end_encoder = PytorchSeq2SeqWrapper( torch.nn.LSTM(encoding_dim * 4 + modeling_dim * 3, hidden_size=cf_a.modeling_span_end_hidden_size, batch_first=True, bidirectional=True, num_layers=cf_a.modeling_span_end_num_layers, dropout=cf_a.span_end_encoder_dropout)) span_end_encoding_dim = span_end_encoder.get_output_dim() span_end_input_dim = encoding_dim * 4 + span_end_encoding_dim self._span_end_encoder = span_end_encoder self._dropout_span_end_encode = dropout_span_end_encode if (self.cf_a.VB_span_end_predictor_linear): print( "----------------- Bayesian Span End Predictor--------------") prior = Vil.Prior(**(cf_a.VB_span_end_predictor_linear_prior)) span_end_predictor_linear = LinearVB(span_end_input_dim, 1, prior=prior) self.VBmodels.append(span_end_predictor_linear) else: span_end_predictor_linear = torch.nn.Linear(span_end_input_dim, 1) self._span_end_predictor_linear = span_end_predictor_linear 
    self._span_end_predictor = TimeDistributed(span_end_predictor_linear)

    """
    Dropout of the last layers
    """
    if cf_a.spans_output_dropout > 0:
        dropout_spans_output = torch.nn.Dropout(p=cf_a.spans_output_dropout)
    else:
        dropout_spans_output = lambda x: x
    self._dropout_spans_output = dropout_spans_output

    """
    Checks and accuracy
    """
    # Bidaf has lots of layer dimensions which need to match up - these aren't necessarily
    # obvious from the configuration files, so we check here.
    check_dimensions_match(modeling_layer.get_input_dim(), 4 * encoding_dim,
                           "modeling layer input dim", "4 * encoding dim")
    check_dimensions_match(Input_dimension_highway, phrase_layer.get_input_dim(),
                           "text field embedder output dim", "phrase layer input dim")
    check_dimensions_match(span_end_encoder.get_input_dim(),
                           4 * encoding_dim + 3 * modeling_dim,
                           "span end encoder input dim",
                           "4 * encoding dim + 3 * modeling dim")

    self._span_start_accuracy = CategoricalAccuracy()
    self._span_end_accuracy = CategoricalAccuracy()
    self._span_accuracy = BooleanAccuracy()
    self._squad_metrics = SquadEmAndF1()

    """
    mask_lstms : ``bool``, optional (default=True)
        If ``False``, we will skip passing the mask to the LSTM layers. This gives a ~2x
        speedup, with only a slight performance decrease, if any. We haven't experimented
        much with this yet, but have confirmed that we still get very similar performance
        with much faster training times. We still use the mask for all softmaxes, but
        avoid the shuffling that's required when using masking with pytorch LSTMs.
    """
    self._mask_lstms = cf_a.mask_lstms

    """
    ################### Initialize parameters ##############################
    """
    #### THEY ARE ALL INITIALIZED WHEN INSTANTIATING THE COMPONENTS ###

    """
    ####################### OPTIMIZER ################
    """
    optimizer = pytut.get_optimizers(self, cf_a)
    self._optimizer = optimizer
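# A worked example of the dimension bookkeeping that check_dimensions_match
# verifies above, assuming illustrative hidden sizes of 100 for both LSTMs
# (these are hypothetical values, not the repo's defaults):
phrase_hidden, modeling_hidden = 100, 100
encoding_dim_ex = 2 * phrase_hidden    # bidirectional phrase layer -> 200
modeling_dim_ex = 2 * modeling_hidden  # bidirectional modeling layer -> 200
# The modeling layer consumes the standard BiDAF merge of the passage, the
# attended question, and their elementwise products: 4 * encoding_dim.
assert 4 * encoding_dim_ex == 800                           # modeling layer input
assert 4 * encoding_dim_ex + modeling_dim_ex == 1000        # span start input
assert 4 * encoding_dim_ex + 3 * modeling_dim_ex == 1400    # span end LSTM input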
def test_weights_are_correct_sizes(self):
    linear = LinearSimilarity(tensor_1_dim=3, tensor_2_dim=6, combination='x,y')
    assert list(linear._weight_vector.size()) == [9]
    assert list(linear._bias.size()) == [1]
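# The expected sizes above follow directly from the combination string: each
# comma-separated piece contributes one vector's dimensionality to the weight
# vector. A quick sketch of that arithmetic (assuming, as the elementwise ops
# require, that 'x*y'-style pieces operate on equal-sized tensors):
def expected_weight_size(tensor_1_dim, tensor_2_dim, combination):
    dims = {'x': tensor_1_dim, 'y': tensor_2_dim}
    # Elementwise pieces like 'x*y' keep a single vector's dimensionality.
    return sum(dims.get(piece, tensor_1_dim) for piece in combination.split(','))

assert expected_weight_size(3, 6, 'x,y') == 9               # the test above
assert expected_weight_size(4, 4, 'x,y,x*y,y-x') == 16      # the from_params test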
# The dimensionality of the encoded question and passage
encoding_dim = encoded_question.size(-1)
print("encoding_dim: ", encoding_dim)
print("Question encoding: ", encoded_question.shape)
print("Passage encoding: ", encoded_passage.shape)

"""
################### SIMILARITY FUNCTION LAYER #########################################
NOTE: Since the LSTM implementation of PyTorch cannot apply dropout in the last layer,
we just apply it ourselves later.
"""
print("-------------- SIMILARITY LAYER ---------------")
similarity_function = LinearSimilarity(combination="x,y,x*y",
                                       tensor_1_dim=200,
                                       tensor_2_dim=200)
matrix_attention = LegacyMatrixAttention(similarity_function)

# Shape: (batch_size, passage_length, question_length)
passage_question_similarity = matrix_attention(encoded_passage, encoded_question)
print("passage question similarity: ", passage_question_similarity.shape)

# Shape: (batch_size, passage_length, question_length)
passage_question_attention = util.masked_softmax(passage_question_similarity, question_mask)

# Shape: (batch_size, passage_length, encoding_dim)
passage_question_vectors = util.weighted_sum(encoded_question, passage_question_attention)
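# For intuition, the two util calls above reduce to a softmax that ignores
# padded question positions followed by a batched matrix product. A minimal
# sketch of the same computation in plain PyTorch (util.masked_softmax and
# util.weighted_sum are AllenNLP helpers; this assumes a 0/1 question_mask of
# shape (batch, question_length) and ignores the fully-masked-row edge case
# that the real helpers handle):
import torch

def masked_softmax_sketch(similarity, mask):
    # similarity: (batch, passage_len, question_len); mask broadcast over passage.
    masked = similarity.masked_fill(~mask.bool().unsqueeze(1), float('-inf'))
    return torch.softmax(masked, dim=-1)

def weighted_sum_sketch(encoded_question, attention):
    # (batch, passage_len, q_len) @ (batch, q_len, dim) -> (batch, passage_len, dim)
    return torch.bmm(attention, encoded_question)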