def __init__(self,
             vocab: Vocabulary,
             span_encoder: Seq2SeqEncoder,
             reasoning_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.3,
             hidden_dim_maxpool: int = 1024,
             class_embs: bool = True,
             reasoning_use_obj: bool = True,
             reasoning_use_answer: bool = True,
             reasoning_use_question: bool = True,
             pool_reasoning: bool = True,
             pool_answer: bool = True,
             pool_question: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             ):
    super(AttentionQA, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)
    ###################################################################################################

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    dim = sum([d for d, to_pool in [(reasoning_encoder.get_output_dim(), self.pool_reasoning),
                                    (span_encoder.get_output_dim(), self.pool_answer),
                                    (span_encoder.get_output_dim(), self.pool_question)] if to_pool])

    self.final_mlp = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
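# A hedged, standalone sketch (not the model's actual forward pass) of how the two
# attentions declared above fit together dimensionally: span_attention scores question
# tokens against answer tokens, and obj_attention scores answer tokens against detector
# object features. All tensor names and sizes below are illustrative assumptions.
import torch
from allennlp.modules.matrix_attention import BilinearMatrixAttention

span_dim, obj_dim = 512, 512          # stand-ins for span_encoder.get_output_dim() and detector.final_dim
q_rep = torch.rand(2, 12, span_dim)   # (batch, question_len, span_dim)
a_rep = torch.rand(2, 6, span_dim)    # (batch, answer_len, span_dim)
obj_rep = torch.rand(2, 5, obj_dim)   # (batch, num_objects, obj_dim)

span_attention = BilinearMatrixAttention(matrix_1_dim=span_dim, matrix_2_dim=span_dim)
obj_attention = BilinearMatrixAttention(matrix_1_dim=span_dim, matrix_2_dim=obj_dim)

qa_similarity = span_attention(q_rep, a_rep)   # (2, 12, 6): question-to-answer scores
ao_similarity = obj_attention(a_rep, obj_rep)  # (2, 6, 5): answer-to-object scores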
def __init__(self, v_dim, q_dim, num_hid, norm='weight', act='LeakyReLU', dropout=0.3):
    super(Att_Bilinear_layer2_keycat_textual_visual, self).__init__()
    # norm_layer = get_norm(norm)
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=q_dim,
        matrix_2_dim=v_dim,
        # matrix_2_dim=512
    )
    self.num_hid = num_hid
def __init__(self, pretrained=True, average_pool=True, semantic=True, final_dim=512,
             input_dropout=0.3, reasoning_use_obj=True, reasoning_use_answer=True,
             reasoning_use_question=True, pool_reasoning=True, pool_answer=True,
             pool_question=True):
    super().__init__()
    # self.detector = SimpleDetector(pretrained=pretrained,
    #                                average_pool=average_pool, semantic=semantic, final_dim=final_dim)
    self.reasoning_encoder = TimeDistributed(
        PytorchSeq2SeqWrapper(
            torch.nn.LSTM(1536, 256, num_layers=2, batch_first=True, bidirectional=True)))
    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None
    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=final_dim,
        matrix_2_dim=final_dim,
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=final_dim,
        matrix_2_dim=final_dim,
    )
    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    # Apply the initializer to this module; a bare `InitializerApplicator(self)`
    # only constructs an applicator and never runs it.
    InitializerApplicator()(self)
def test_forward_does_a_bilinear_product(self):
    params = Params({"matrix_1_dim": 2, "matrix_2_dim": 2})
    bilinear = BilinearMatrixAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-0.3, 0.5], [2.0, -1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([0.1]))
    a_vectors = torch.FloatTensor([[[1, 1], [2, 2]]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2, 2)
    assert_almost_equal(result, [[[1.8, -0.4], [3.5, -0.9]]])
def test_forward_does_a_bilinear_product(self):
    params = Params({
        'matrix_1_dim': 2,
        'matrix_2_dim': 2,
    })
    bilinear = BilinearMatrixAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-.3, .5], [2.0, -1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([.1]))
    a_vectors = torch.FloatTensor([[[1, 1], [2, 2]]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2, 2)
    assert_almost_equal(result, [[[1.8, -.4], [3.5, -.9]]])
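# A minimal sanity check (not part of the tests above) that reproduces the expected
# values by computing the bilinear form X W Y^T + b directly with torch, using the
# same weight matrix, bias, and inputs as the two tests.
import torch

W = torch.tensor([[-0.3, 0.5], [2.0, -1.0]])
bias = 0.1
a = torch.tensor([[[1., 1.], [2., 2.]]])   # (batch, rows_1, dim_1)
b = torch.tensor([[[1., 0.], [0., 1.]]])   # (batch, rows_2, dim_2)
scores = a @ W @ b.transpose(1, 2) + bias  # (batch, rows_1, rows_2)
print(scores)  # tensor([[[ 1.8, -0.4], [ 3.5, -0.9]]])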
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             attended_encoder: Seq2SeqEncoder,
             output_feedforward: FeedForward,
             regularizer: Optional[RegularizerApplicator] = None,
             detector_final_dim: int = 512,
             dropout: float = 0.5,
             initializer: InitializerApplicator = InitializerApplicator()) -> None:
    """
    :param vocab:
    :param text_field_embedder:
    :param encoder:
    :param attended_encoder:
    :param output_feedforward:
    :param regularizer:
    :param detector_final_dim:
    :param dropout:
    :param initializer:
    """
    super().__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._encoder = encoder
    self.detector = ROIDetector(detector_final_dim)

    if dropout:
        self.dropout = nn.Dropout(dropout)
        self.rnn_input_dropout = InputVariationalDropout(dropout)
    else:
        self.dropout = None
        self.rnn_input_dropout = None

    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim)

    self._attended_encoder = attended_encoder
    self._output_feedforward = output_feedforward

    self._accuracy = CategoricalAccuracy()
    self._loss = nn.CrossEntropyLoss()

    initializer(self)
def test_forward_does_a_bilinear_product_when_using_biases(self):
    params = Params({
        u'matrix_1_dim': 2,
        u'matrix_2_dim': 2,
        u'use_input_biases': True
    })
    bilinear = BilinearMatrixAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-.3, .5, 1.0],
                                                           [2.0, -1.0, -1.0],
                                                           [1.0, 0.5, 1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([.1]))
    a_vectors = torch.FloatTensor([[[1, 1], [2, 2]]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2, 2)
    assert_almost_equal(result, [[[3.8, 1.1], [5.5, 0.6]]])
def test_forward_does_a_bilinear_product_when_using_biases(self):
    params = Params({
        'matrix_1_dim': 2,
        'matrix_2_dim': 2,
        'use_input_biases': True
    })
    bilinear = BilinearMatrixAttention.from_params(params)
    bilinear._weight_matrix = Parameter(torch.FloatTensor([[-.3, .5, 1.0],
                                                           [2.0, -1.0, -1.0],
                                                           [1.0, 0.5, 1.0]]))
    bilinear._bias = Parameter(torch.FloatTensor([.1]))
    a_vectors = torch.FloatTensor([[[1, 1], [2, 2]]])
    b_vectors = torch.FloatTensor([[[1, 0], [0, 1]]])
    result = bilinear(a_vectors, b_vectors).detach().numpy()
    assert result.shape == (1, 2, 2)
    assert_almost_equal(result, [[[3.8, 1.1], [5.5, 0.6]]])
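# A minimal sanity check (not from the tests above) of the `use_input_biases` path:
# a constant 1 is appended to each input vector before the bilinear product, so the
# 3x3 weight above acts on [x; 1] and [y; 1]. Reproducing that by hand with torch
# gives the same expected values.
import torch

W = torch.tensor([[-0.3, 0.5, 1.0], [2.0, -1.0, -1.0], [1.0, 0.5, 1.0]])
bias = 0.1
a = torch.tensor([[[1., 1.], [2., 2.]]])
b = torch.tensor([[[1., 0.], [0., 1.]]])
ones = torch.ones(1, 2, 1)
a1 = torch.cat([a, ones], dim=-1)           # (1, 2, 3): inputs with appended bias feature
b1 = torch.cat([b, ones], dim=-1)           # (1, 2, 3)
scores = a1 @ W @ b1.transpose(1, 2) + bias
print(scores)  # tensor([[[3.8, 1.1], [5.5, 0.6]]])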
def __init__(self, config):
    super(RobertaForRopes, self).__init__(config)

    self.roberta = RobertaModel(config)
    self.find_object1 = MLP(config.hidden_size, 1)
    self.find_object2 = MLP(config.hidden_size, 1)
    self.find_TP = MLP(config.hidden_size, 1)
    self.bb_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=config.hidden_size,
        tensor_2_dim=config.hidden_size,
        combination="x,y,x*y")
    self.bs_bilinear_imilairty = BilinearMatrixAttention(
        matrix_1_dim=config.hidden_size,
        matrix_2_dim=config.hidden_size)
    self.ss_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=config.hidden_size,
        tensor_2_dim=config.hidden_size,
        combination="x,y,x*y")
    self.rel_SPo1_SPo2 = Relevance(config)
    self.pol_TP_SP = MLP(config.hidden_size * 2, 2)
    # self.rel_TPo1_TPo2 = None
    # self.answer_according_to_question = None

    self.init_weights()
def __init__(self, args):
    super().__init__(args)
    self.n_graph_attn_composition_layers = args.n_graph_attn_composition_layers
    self.output_size = self.transformer.config.hidden_size
    self.graph_dim = args.graph_dim

    if self.use_semantic_graph:
        self.emb_proj = nn.Linear(self.transformer.config.hidden_size, self.graph_dim)

        def get_gnn_instance(n_layers):
            return RGCN(
                num_bases=args.graph_n_bases,
                h_dim=self.graph_dim,
                num_relations=self.num_relations,
                num_hidden_layers=n_layers,
                dropout=args.graph_dropout,
                activation=self.activation,
            )

        self.rgcn = get_gnn_instance(args.n_graph_layers)
        if self.n_graph_attn_composition_layers > 0:
            self.composition_rgcn = get_gnn_instance(self.n_graph_attn_composition_layers)

        self.attn_biaffine = BilinearMatrixAttention(self.graph_dim, self.graph_dim,
                                                     use_input_biases=True)
        self.attn_proj = nn.Linear(4 * self.graph_dim, self.graph_dim)
        self.graph_output_proj = nn.Linear(self.graph_dim, self.graph_dim)

        self.output_size += (2 if self.is_sentence_pair_task else 1) * self.graph_dim

        if self.args.post_combination_layernorm:
            self.post_combination_layernorm = nn.LayerNorm(
                self.output_size, eps=self.transformer.config.layer_norm_eps)
def __init__(self, vocab: Vocabulary,
             text_encoder: Seq2SeqEncoder,
             word_embedder: TextFieldEmbedder,
             enable_training_log: bool = False,
             inp_drop_rate: float = 0.2,
             out_drop_rate: float = 0.2,
             loss_weights: List = (0.2, 0.4, 0.4),
             super_mode: str = 'before',
             backbone: str = 'unet',
             unet_down_channel: int = 256,
             feature_sel: int = 127):
    super(UnifiedFollowUp, self).__init__(vocab)
    self.text_encoder = text_encoder
    self.word_embedder = word_embedder

    """
    Define model arch choices
    """
    self.backbone = backbone

    # input dropout
    if inp_drop_rate > 0:
        self.var_inp_dropout = InputVariationalDropout(p=inp_drop_rate)
    else:
        self.var_inp_dropout = lambda x: x
    # output dropout
    if out_drop_rate > 0:
        self.var_out_dropout = InputVariationalDropout(p=out_drop_rate)
    else:
        self.var_out_dropout = lambda x: x

    self.hidden_size = text_encoder.get_output_dim() // 2 if text_encoder.is_bidirectional() \
        else text_encoder.get_output_dim()
    self.output_size = text_encoder.get_output_dim()

    # ele -> element wise multiply
    # dot -> dot product
    # cos -> cosine similarity
    # emb_dot -> embedding dot product
    # emb_cos -> embedding cosine similarity
    # linear -> linear similarity
    # bilinear -> bilinear similarity
    feature_sel = feature_sel
    sel_arr = "{0:07b}".format(int(feature_sel))
    nni_choices = ['ele', 'dot', 'cos', 'emb_dot', 'emb_cos', 'linear', 'bilinear']

    self.segment_choices = [nni_choices[i] for i in range(7) if sel_arr[i] == '1']
    # if expand bi-direction, we will regard forward/backward as two channels
    self.expand_bidir = False

    self.similar_function = ModuleDict({
        'ele': ElementWiseMatrixAttention(),
        'dot': DotProductMatrixAttention(),
        'cos': CosineMatrixAttention(),
        'emb_dot': DotProductMatrixAttention(),
        'emb_cos': CosineMatrixAttention(),
        'bilinear': BilinearMatrixAttention(matrix_1_dim=self.output_size,
                                            matrix_2_dim=self.output_size),
        'linear': LinearMatrixAttention(tensor_1_dim=self.output_size,
                                        tensor_2_dim=self.output_size)
    })

    self.attn_channel = 0
    for choice in self.segment_choices:
        if choice == 'ele':
            self.attn_channel += self.output_size
        elif choice in ['dot', 'cos', 'emb_dot', 'emb_cos', 'bilinear', 'linear']:
            if self.expand_bidir:
                self.attn_channel += 2
            else:
                self.attn_channel += 1

    self.class_mapping: Dict[str, int] = get_class_mapping(super_mode=super_mode)

    # Here we have two choices now, one is MLP, and another is UNet
    if self.backbone == 'unet':
        self.segmentation_net = AttentionUNet(input_channels=self.attn_channel,
                                              class_number=len(self.class_mapping.keys()),
                                              down_channel=unet_down_channel)
    else:
        raise Exception("Currently we do not support for other arches.")

    class_zero_weight = loss_weights[0]
    class_one_weight = loss_weights[1]

    self.register_buffer('weight_tensor',
                         torch.tensor([class_zero_weight, class_one_weight,
                                       1 - class_zero_weight - class_one_weight]))
    self.loss = nn.CrossEntropyLoss(ignore_index=-1, weight=self.weight_tensor)

    # initialize metrics measurement
    self.metrics = {'ROUGE': BatchAverage(),
                    '_ROUGE1': BatchAverage(),
                    '_ROUGE2': BatchAverage(),
                    # TODO: You can speed up the code by disable BLEU since
                    # the corpus-based BLEU metric is much time-consuming.
                    'BLEU': CorpusBLEUMetric(),
                    'EM': BatchAverage(),
                    'F1': FScoreMetric(prefix="1"),
                    'F2': FScoreMetric(prefix="2"),
                    'F3': FScoreMetric(prefix="3")}

    parameter_num = count_parameters(self)
    print(parameter_num)

    self.min_width = 8
    self.min_height = 8
    self.enable_training_log = enable_training_log
                            combination='x,y',
                            activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention only allows matrices of the same size
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim1))

matrix_attention = DotProductMatrixAttention()
output = matrix_attention(matrix1, matrix2)
print('Output shape of DotProductMatrixAttention:', output.shape)

# bilinear & linear attention allows inputs of different sizes
matrix1 = torch.rand((1, sequence_length1, embedding_dim1))
matrix2 = torch.rand((1, sequence_length2, embedding_dim2))

matrix_attention = BilinearMatrixAttention(matrix_1_dim=embedding_dim1,
                                           matrix_2_dim=embedding_dim2)
output = matrix_attention(matrix1, matrix2)
print('Output shape of BilinearMatrixAttention:', output.shape)

matrix_attention = LinearMatrixAttention(tensor_1_dim=embedding_dim1,
                                         tensor_2_dim=embedding_dim2,
                                         combination='x,y',
                                         activation=tanh)
output = matrix_attention(matrix1, matrix2)
print('Output shape of LinearMatrixAttention:', output.shape)
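# A short follow-up sketch (not part of the demo above): the matrix-attention modules
# return raw similarity scores of shape (batch, rows_1, rows_2). To use them as
# attention weights they are typically normalised, e.g. with
# allennlp.nn.util.masked_softmax; the tensor sizes below are illustrative.
import torch
from allennlp.nn.util import masked_softmax
from allennlp.modules.matrix_attention import BilinearMatrixAttention

matrix1 = torch.rand(1, 10, 300)
matrix2 = torch.rand(1, 15, 200)
matrix2_mask = torch.ones(1, 15).bool()           # all positions valid in this toy example

attention = BilinearMatrixAttention(matrix_1_dim=300, matrix_2_dim=200)
scores = attention(matrix1, matrix2)              # (1, 10, 15) raw scores
weights = masked_softmax(scores, matrix2_mask)    # each row sums to 1 over matrix2 positions
attended = weights.bmm(matrix2)                   # (1, 10, 200) weighted sum of matrix2 rows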
def train_valid_base_text_model(model_name):
    """
    :param model_name: the full model name to use
    :return:
    """
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()}

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

    reader = TextExpDataSetReader(token_indexers=token_indexer, tokenizer=tokenizer,
                                  add_numeric_data=False)
    train_instances = reader.read(train_data_file_path)
    validation_instances = reader.read(validation_data_file_path)
    vocab = Vocabulary()

    # TODO: change this if necessary
    # batch_size should be: 10 or 9 depends on the input
    # and not shuffle so all the data of the same pair will be in the same batch
    iterator = BasicIterator(batch_size=batch_size)  # , instances_per_epoch=10)
    # sorting_keys=[('sequence_review', 'list_num_tokens')])
    iterator.index_with(vocab)

    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                   'elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                  'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # TODO: check the output of this
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=2)
    # word_embeddings = elmo_embedder
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    review_attention_layer = models.AttentionSoftMaxLayer(
        BilinearMatrixAttention(word_embeddings.get_output_dim(),
                                word_embeddings.get_output_dim()))
    seq_attention_layer = models.AttentionSoftMaxLayer(DotProductMatrixAttention())
    feed_forward = FeedForward(input_dim=batch_size, num_layers=2,
                               hidden_dims=[batch_size, 1], activations=ReLU(),
                               dropout=[0.2, 0.0])
    fc_review_rep = FeedForward(input_dim=124, num_layers=1, hidden_dims=[10],
                                activations=ReLU())

    criterion = nn.MSELoss()

    metrics_dict = {
        'mean_absolute_error': MeanAbsoluteError(),
    }

    model = models.BasicTextModel(word_embedding=word_embeddings,
                                  review_representation_layer=review_attention_layer,
                                  seq_representation_layer=seq_attention_layer,
                                  vocab=vocab,
                                  criterion=criterion,
                                  metrics_dict=metrics_dict,
                                  classifier_feedforward=feed_forward,
                                  fc_review_rep=fc_review_rep)

    optimizer = optim.Adam(model.parameters(), lr=0.1)

    num_epochs = 2

    run_log_directory = utils.set_folder(
        datetime.now().strftime(f'{model_name}_{num_epochs}_epochs_%d_%m_%Y_%H_%M_%S'), 'logs')

    if not os.path.exists(run_log_directory):
        os.makedirs(run_log_directory)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_instances,
        validation_dataset=validation_instances,
        num_epochs=num_epochs,
        shuffle=False,
        serialization_dir=run_log_directory,
        patience=10,
        histogram_interval=10,
    )

    model_dict = trainer.train()

    print(f'{model_name}: evaluation measures are:')
    for key, value in model_dict.items():
        print(f'{key}: {value}')
def __init__(self,
             vocab: Vocabulary,
             span_encoder: Seq2SeqEncoder,
             reasoning_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.3,
             hidden_dim_maxpool: int = 1024,
             class_embs: bool = True,
             reasoning_use_obj: bool = True,
             reasoning_use_answer: bool = True,
             reasoning_use_question: bool = True,
             pool_reasoning: bool = True,
             pool_answer: bool = True,
             pool_question: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             ):
    super(MultiHopAttentionQAFreezeDetRes101, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=False, final_dim=512)
    ###################################################################################################
    # freeze everything related to conv net
    for submodule in self.detector.backbone.modules():
        # if isinstance(submodule, BatchNorm2d):
        #     submodule.track_running_stats = False
        for p in submodule.parameters():
            p.requires_grad = False
    for submodule in self.detector.after_roi_align.modules():
        # if isinstance(submodule, BatchNorm2d):
        #     submodule.track_running_stats = False
        for p in submodule.parameters():
            p.requires_grad = False

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    dim = sum([d for d, to_pool in [(reasoning_encoder.get_output_dim(), self.pool_reasoning),
                                    (span_encoder.get_output_dim(), self.pool_answer),
                                    (span_encoder.get_output_dim(), self.pool_question)] if to_pool])

    self.final_mlp = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(self, config):
    super(Relevance, self).__init__()
    self.logits_SPo1 = BilinearMatrixAttention(matrix_1_dim=config.hidden_size,
                                               matrix_2_dim=config.hidden_size)
    self.logits_SPo2 = BilinearMatrixAttention(matrix_1_dim=config.hidden_size,
                                               matrix_2_dim=config.hidden_size)
def __init__(self,
             span_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.3,
             class_embs: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             learned_omcs: dict = {},
             ):
    # VCR dataset becomes unpicklable due to VCR.vocab, but we don't need
    # to pass in vocab from the dataset anyway as the BERT embeddings are
    # pretrained and stored in h5 files per dataset instance. Just pass
    # a dummy vocab instance for init.
    vocab = Vocabulary()
    super(KeyValueAttentionTrunk, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    span_dim = span_encoder.get_output_dim()

    self.question_mlp = torch.nn.Sequential(
        # 2 (bidirectional) * 4 (num_answers) * dim -> dim
        torch.nn.Linear(8 * span_dim, span_dim),
        torch.nn.Tanh(),
    )
    self.answer_mlp = torch.nn.Sequential(
        # 2 (bidirectional) * dim -> 2 (key-value) * dim
        torch.nn.Linear(2 * span_dim, 2 * span_dim),
        torch.nn.Tanh(),
    )
    self.obj_mlp = torch.nn.Sequential(
        # obj_dim -> 2 (key-value) * dim
        torch.nn.Linear(self.detector.final_dim, 2 * span_dim),
        torch.nn.Tanh(),
    )

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self.kv_transformer = KeyValueTransformer(
        dim=span_dim,
        num_heads=8,
        num_steps=4,
    )

    self.omcs_index = None
    if learned_omcs.get('enabled', False):
        use_sentence_embs = learned_omcs.get('use_sentence_embeddings', True)
        omcs_embs, self.omcs_index = self.load_omcs(use_sentence_embs)
        # Let's replicate the OMCS embeddings to each device to attend over them
        # after FAISS lookup. We could also do faiss.search_and_reconstruct, but
        # that prevents us from using quantized indices for faster search which
        # we might need to.
        self.register_buffer('omcs_embs', omcs_embs)
        self.omcs_mlp = torch.nn.Sequential(
            torch.nn.Linear(768, self.omcs_index.d),
        )
        self.k = learned_omcs.get('max_neighbors', 5)
        self.similarity_thresh = learned_omcs.get('similarity_thresh', 0.0)

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             text_encoder: Seq2SeqEncoder,
             target_encoder: Seq2VecEncoder,
             feedforward: Optional[FeedForward] = None,
             target_field_embedder: Optional[TextFieldEmbedder] = None,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             dropout: float = 0.0,
             target_scale: bool = False,
             context_preserving: bool = False) -> None:
    '''
    :param vocab: A Vocabulary, required in order to compute sizes for
                  input/output projections.
    :param text_field_embedder: Used to embed the text, and also the target
                                text if target_field_embedder is None but the
                                target_encoder is not None.
    :param text_encoder: Sequence encoder that will create the representation
                         of each token in the context sentence.
    :param target_encoder: Encoder that will create the representation of the
                           target text tokens.
    :param feedforward: An optional feed forward layer applied after the text
                        encoder if the target encoder is None; otherwise it is
                        applied after the target and text encoded
                        representations have been concatenated.
    :param target_field_embedder: Used to embed the target text to give as
                                  input to the target_encoder. This allows a
                                  separate embedding for text and target text.
    :param target_concat_text_embedding: Whether or not the target should be
                                         concatenated to each word embedding
                                         within the text before being encoded.
    :param initializer: Used to initialize the model parameters.
    :param regularizer: If provided, will be used to calculate the
                        regularization penalty during training.
    :param word_dropout: Dropout that is applied after the embedding of the
                         tokens/words. It drops entire words with this
                         probability.
    :param dropout: Dropout applied after each layer apart from the last.
                    All dropout applied to time-based data is
                    `variational dropout`_; all else is standard dropout.

    .. _variational dropout:
       https://papers.nips.cc/paper/6241-a-theoretically-grounded-application-of-dropout-in-recurrent-neural-networks.pdf
    '''
    super().__init__(vocab, regularizer)

    self.text_field_embedder = text_field_embedder
    self.target_field_embedder = target_field_embedder
    self.num_classes = self.vocab.get_vocab_size("labels")
    self.text_encoder = text_encoder
    self.target_encoder = TimeDistributed(target_encoder)
    self.feedforward = feedforward
    if self.feedforward:
        self.time_feedforward = TimeDistributed(self.feedforward)

    self.attention_layer = BilinearMatrixAttention(text_encoder.get_output_dim(),
                                                   target_encoder.get_output_dim())
    # Whether to concat the encoded text representation with the weighted
    # representation from the attention
    self.context_preserving = context_preserving

    if feedforward is not None:
        output_dim = self.feedforward.get_output_dim()
    else:
        if self.context_preserving:
            output_dim = (text_encoder.get_output_dim() * 2)
        else:
            output_dim = text_encoder.get_output_dim()
    self.label_projection = TimeDistributed(Linear(output_dim, self.num_classes))
    self.metrics = {"accuracy": CategoricalAccuracy()}
    self.f1_metrics = {}
    # F1 Scores
    label_index_name = self.vocab.get_index_to_token_vocabulary('labels')
    for label_index, label_name in label_index_name.items():
        label_name = f'F1_{label_name.capitalize()}'
        self.f1_metrics[label_name] = F1Measure(label_index)

    self._variational_dropout = InputVariationalDropout(dropout)
    self._naive_dropout = Dropout(dropout)
    self._time_naive_dropout = TimeDistributed(self._naive_dropout)
    self._time_variational_dropout = TimeDistributed(self._variational_dropout)

    self.target_scale = target_scale

    self.loss = torch.nn.CrossEntropyLoss()

    check_dimensions_match(text_field_embedder.get_output_dim(),
                           text_encoder.get_input_dim(),
                           "text field embedding dim", "text encoder input dim")
    # Ensure that the dimensions of the target or text field embedder and
    # the target encoder match
    target_field_embedder_dim = text_field_embedder.get_output_dim()
    target_field_error = "text field embedding dim"
    if self.target_field_embedder:
        target_field_embedder_dim = target_field_embedder.get_output_dim()
        target_field_error = "target field embedding dim"

    check_dimensions_match(target_field_embedder_dim,
                           target_encoder.get_input_dim(),
                           target_field_error, "target encoder input dim")
    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             span_encoder: Seq2SeqEncoder,
             reasoning_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.3,
             hidden_dim_maxpool: int = 1024,
             class_embs: bool = True,
             reasoning_use_obj: bool = True,
             reasoning_use_answer: bool = True,
             reasoning_use_question: bool = True,
             pool_reasoning: bool = True,
             pool_answer: bool = True,
             pool_question: bool = False,
             reasoning_use_vision: bool = True,
             initializer: InitializerApplicator = InitializerApplicator(),
             ):
    super(AttentionQA, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)
    self.extractor = SimpleExtractor(pretrained=True, num_classes=365, arch='resnet50')
    ###################################################################################################

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)

    # add scene classification visual feature and word embedding feature
    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    self.reasoning_use_vision = reasoning_use_vision
    dim = sum([d for d, to_pool in [(reasoning_encoder.get_output_dim(), self.pool_reasoning),
                                    (span_encoder.get_output_dim(), self.pool_answer),
                                    (span_encoder.get_output_dim(), self.pool_question)] if to_pool])

    self.projection = torch.nn.Conv2d(2048, self.detector.final_dim,
                                      kernel_size=1, stride=2, padding=1, bias=True)

    self.final_mlp = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self._accuracy = CategoricalAccuracy()
    # I want to replace the CrossEntropyLoss with LSR
    # self._loss = LabelSmoothingLoss(size=4, smoothing=0.1)
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def __init__(self,
             span_encoder: Seq2SeqEncoder,
             reasoning_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.3,
             hidden_dim_maxpool: int = 1024,
             class_embs: bool = True,
             reasoning_use_obj: bool = True,
             reasoning_use_answer: bool = True,
             reasoning_use_question: bool = True,
             pool_reasoning: bool = True,
             pool_answer: bool = True,
             pool_question: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             learned_omcs: dict = {},
             ):
    # VCR dataset becomes unpicklable due to VCR.vocab, but we don't need
    # to pass in vocab from the dataset anyway as the BERT embeddings are
    # pretrained and stored in h5 files per dataset instance. Just pass
    # a dummy vocab instance for init.
    vocab = Vocabulary()
    super(AttentionQATrunk, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)
    ###################################################################################################

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    self.output_dim = sum([d for d, to_pool in [(reasoning_encoder.get_output_dim(), self.pool_reasoning),
                                                (span_encoder.get_output_dim(), self.pool_answer),
                                                (span_encoder.get_output_dim(), self.pool_question)] if to_pool])

    self.omcs_index = None
    if learned_omcs.get('enabled', False):
        use_sentence_embs = learned_omcs.get('use_sentence_embeddings', True)
        omcs_embs, self.omcs_index = self.load_omcs(use_sentence_embs)
        # Let's replicate the OMCS embeddings to each device to attend over them
        # after FAISS lookup. We could also do faiss.search_and_reconstruct, but
        # that prevents us from using quantized indices for faster search which
        # we might need to.
        self.register_buffer('omcs_embs', omcs_embs)
        self.omcs_mlp = torch.nn.Sequential(
            torch.nn.Linear(768, self.omcs_index.d),
        )
        self.k = learned_omcs.get('max_neighbors', 5)
        self.similarity_thresh = learned_omcs.get('similarity_thresh', 0.0)

    initializer(self)
def __init__(self,
             question_encoding_dim: int,
             passage_encoding_dim: int,
             passage_attention_to_span: Seq2SeqEncoder,
             passage_startend_predictor,
             question_attention_to_span: Seq2SeqEncoder,
             passage_attention_to_count: Seq2SeqEncoder,
             num_implicit_nums: int = None,
             passage_count_predictor=None,
             passage_count_hidden2logits=None,
             dropout: float = 0.0,
             ):
    super().__init__()

    self.num_counts = 10

    self.passage_attention_scalingvals = [1, 2, 5, 10]

    # Parameters for answer start/end prediction from PassageAttention
    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = passage_startend_predictor
    # torch.nn.Linear(self.passage_attention_to_span.get_output_dim(), 2)

    # Parameters for answer start/end prediction directly from the passage encoding
    # (direct PassageSpanAnswer from a 1-step program)
    self.oneshot_psa_startend_predictor = torch.nn.Linear(passage_encoding_dim, 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.implicit_num_embeddings = torch.nn.Parameter(
        torch.FloatTensor(num_implicit_nums, passage_encoding_dim))
    torch.nn.init.normal_(self.implicit_num_embeddings, mean=0.0, std=0.001)
    self.implicitnum_bilinear_attention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # self.filter_matrix_attention = LinearMatrixAttention(
    #     tensor_1_dim=question_encoding_dim, tensor_2_dim=passage_encoding_dim, combination="x,y,x*y"
    # )
    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    self._endpoint_span_extractor = EndpointSpanExtractor(
        input_dim=passage_encoding_dim, combination="x,y")

    # We will sum the passage-token-repr to the weighted-q-repr - to use x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage_to_passage attention, hopefully, for each token,
    # putting a weight on date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage_to_passage attention, hopefully, for each token,
    # putting a weight on number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self,
             question_encoding_dim: int,
             passage_encoding_dim: int,
             passage_attention_to_span: Seq2SeqEncoder,
             question_attention_to_span: Seq2SeqEncoder,
             passage_attention_to_count: Seq2SeqEncoder,
             passage_count_predictor=None,
             passage_count_hidden2logits=None,
             dropout: float = 0.0):
    super().__init__()

    self.num_counts = 10

    self.passage_attention_scalingvals = [1, 2, 5, 10]

    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = torch.nn.Linear(
        self.passage_attention_to_span.get_output_dim(), 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # We will sum the passage-token-repr to the weighted-q-repr - to use x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage_to_passage attention, hopefully, for each token,
    # putting a weight on date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage_to_passage attention, hopefully, for each token,
    # putting a weight on number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
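# A hedged usage sketch (not taken from the module above): the passage-to-passage
# bilinear attentions declared in __init__ score every passage token against every
# other token; masking the columns to date-token positions and normalising gives,
# per token, a distribution over date tokens. All tensor names, sizes, and the
# date-token positions below are illustrative assumptions.
import torch
from allennlp.nn.util import masked_softmax
from allennlp.modules.matrix_attention import BilinearMatrixAttention

passage_encoding_dim = 8
passage_repr = torch.rand(1, 20, passage_encoding_dim)   # (batch, passage_len, dim)
date_token_mask = torch.zeros(1, 20).bool()
date_token_mask[0, [3, 4, 11]] = True                    # hypothetical date-token positions

passage_to_date_attention = BilinearMatrixAttention(
    matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
scores = passage_to_date_attention(passage_repr, passage_repr)  # (1, 20, 20)
p2d_weights = masked_softmax(scores, date_token_mask)           # each row attends only to date tokens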
def train_valid_base_text_decision_results_ep_model(model_name: str,
                                                    single_round_label: bool,
                                                    use_only_prev_round: bool,
                                                    train_data_file_name: str,
                                                    validation_data_file_name: str,
                                                    no_history: bool = False):
    """
    This function trains and validates a model that uses both texts and numbers.
    :param model_name: the full model name
    :param single_round_label: the label to use: single round or total payoff
    :param use_only_prev_round: whether to use all the history or only the previous round
    :param train_data_file_name: the name of the train_data to use
    :param validation_data_file_name: the name of the validation_data to use
    :param no_history: if we don't want to use any history data
    :return:
    """
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()}

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

    reader = TextExpDataSetReader(token_indexers=token_indexer,
                                  tokenizer=tokenizer,
                                  add_numeric_data=True,
                                  use_only_prev_round=use_only_prev_round,
                                  single_round_label=single_round_label,
                                  three_losses=True,
                                  no_history=no_history)
    train_data_file_inner_path = os.path.join(data_directory, train_data_file_name)
    validation_data_file_inner_path = os.path.join(data_directory, validation_data_file_name)
    train_instances = reader.read(train_data_file_inner_path)
    validation_instances = reader.read(validation_data_file_inner_path)
    vocab = Vocabulary()

    # TODO: change this if necessary
    # batch_size should be: 10 or 9 depends on the input
    # and not shuffle so all the data of the same pair will be in the same batch
    iterator = BasicIterator(batch_size=9)  # , instances_per_epoch=10)
    # sorting_keys=[('sequence_review', 'list_num_tokens')])
    iterator.index_with(vocab)

    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                   'elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                  'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # TODO: check the output of this
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=2)
    # word_embeddings = elmo_embedder
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    review_attention_layer = models.AttentionSoftMaxLayer(
        BilinearMatrixAttention(word_embeddings.get_output_dim(),
                                word_embeddings.get_output_dim()))
    seq_attention_layer = models.AttentionSoftMaxLayer(DotProductMatrixAttention())
    fc_review_rep_output_dim = reader.max_tokens_len
    fc_review_rep = FeedForward(input_dim=reader.max_tokens_len, num_layers=1,
                                hidden_dims=[fc_review_rep_output_dim], activations=ReLU())
    # seq_attention_layer = FeedForward(input_dim=)
    # numbers_lstm: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(2, 10, bidirectional=True, batch_first=True))

    # the shape of the flatten data rep
    feed_forward_input_dim = reader.max_seq_len * (fc_review_rep_output_dim + reader.number_length)
    feed_forward_classification = FeedForward(input_dim=feed_forward_input_dim, num_layers=1,
                                              hidden_dims=[2], activations=ReLU(), dropout=[0.0])
    feed_forward_regression = FeedForward(input_dim=feed_forward_input_dim, num_layers=1,
                                          hidden_dims=[1], activations=ReLU(), dropout=[0.0])
    criterion_classification = nn.BCEWithLogitsLoss()
    criterion_regression = nn.MSELoss()

    metrics_dict = {
        "accuracy": CategoricalAccuracy(),
        # 'auc': Auc(),
        # 'F1measure': F1Measure(positive_label=1),
    }

    model = models.BasicTextDecisionResultModel(
        word_embedding=word_embeddings,
        review_representation_layer=review_attention_layer,
        seq_representation_layer=seq_attention_layer,
        vocab=vocab,
        classifier_feedforward_classification=feed_forward_classification,
        classifier_feedforward_regression=feed_forward_regression,
        fc_review_rep=fc_review_rep,
        criterion_classification=criterion_classification,
        criterion_regression=criterion_regression,
        metrics_dict=metrics_dict,
        add_numbers=True,
        max_tokens_len=reader.max_tokens_len,
    )

    optimizer = optim.Adam(model.parameters(), lr=0.1)

    num_epochs = 2

    run_log_directory = utils.set_folder(
        datetime.now().strftime(f'{model_name}_{num_epochs}_epochs_%d_%m_%Y_%H_%M_%S'), 'logs')

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_instances,
        validation_dataset=validation_instances,
        num_epochs=num_epochs,
        shuffle=False,
        serialization_dir=run_log_directory,
        patience=10,
        histogram_interval=10,
    )

    model_dict = trainer.train()

    print(f'{model_name}: evaluation measures are:')
    for key, value in model_dict.items():
        if 'accuracy' in key:
            value = value * 100
        print(f'{key}: {value}')

    # save the model predictions
    model.predictions.to_csv(os.path.join(run_log_directory, 'predictions.csv'))
def __init__(self,
             vocab: Vocabulary,
             span_encoder: Seq2SeqEncoder,
             reasoning_encoder: Seq2SeqEncoder,
             input_dropout: float = 0.1,
             hidden_dim_maxpool: int = 512,
             class_embs: bool = True,
             reasoning_use_obj: bool = True,
             reasoning_use_answer: bool = True,
             reasoning_use_question: bool = True,
             pool_reasoning: bool = True,
             pool_answer: bool = True,
             pool_question: bool = False,
             preload_path: str = "source_model.th",
             initializer: InitializerApplicator = InitializerApplicator(),
             ):
    super(AttentionQA, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)
    ###################################################################################################

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)
    self.BiLSTM = TimeDistributed(MYLSTM(1280, 512, 256))
    self.source_encoder = TimeDistributed(source_LSTM(768, 256))

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.span_attention_2 = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )
    self.obj_attention_2 = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )
    self._matrix_attention = DotProductMatrixAttention()
    # self._matrix_attention = MatrixAttention(similarity_function)

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question
    dim = sum([d for d, to_pool in [(reasoning_encoder.get_output_dim(), self.pool_reasoning),
                                    (span_encoder.get_output_dim(), self.pool_answer),
                                    (span_encoder.get_output_dim(), self.pool_question)] if to_pool])

    self.final_mlp = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self.final_mlp_2 = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self.answer_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.question_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.source_answer_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.source_question_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.image_BN = BatchNorm1d(512)
    self.final_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.final_mlp_linear = torch.nn.Sequential(torch.nn.Linear(512, 1))
    self.final_mlp_pool = torch.nn.Sequential(
        torch.nn.Linear(2560, 512),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
    )

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)

    if preload_path is not None:
        logger.info("Preloading!")
        preload = torch.load(preload_path)
        own_state = self.state_dict()
        for name, param in preload.items():
            # if name[0:8] == "_encoder":
            #     suffix = "._module." + name[9:]
            #     logger.info("preload parameter {}".format("span_encoder" + suffix))
            #     own_state["span_encoder" + suffix].copy_(param)
            # newly introduced source_encoder
            if name[0:4] == "LSTM":
                suffix = "._module" + name[4:]
                logger.info("preload parameter {}".format("source_encoder" + suffix))
                own_state["source_encoder" + suffix].copy_(param)