def test_dot_product_similarity(self):
    # example use case: a batch of size 2,
    # with a time element component (e.g. sentences of length 2);
    # each word is a vector of length 3.
    # it is comparing this with another input of the same type
    output = DotProductMatrixAttention()(
        torch.FloatTensor([[[0, 0, 0], [4, 5, 6]], [[-7, -8, -9], [10, 11, 12]]]),
        torch.FloatTensor([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]),
    )

    # for the first batch there is
    #   no correlation between the first words of the input matrix
    #   but perfect correlation for the second word
    # for the second batch there is
    #   negative correlation for the first words
    #   a correlation for the second word
    assert_almost_equal(
        output.numpy(),
        numpy.array([[[0, 0], [32, 77]], [[-194, -266], [266, 365]]]),
        decimal=2,
    )
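# A minimal standalone sketch (not part of the test above): DotProductMatrixAttention
# scores every row of matrix_1 against every row of matrix_2 with a plain dot product,
# i.e. a batched matrix_1 @ matrix_2^T, so the 77 above is just 4*4 + 5*5 + 6*6.
import torch

matrix_1 = torch.FloatTensor([[[0, 0, 0], [4, 5, 6]]])
matrix_2 = torch.FloatTensor([[[1, 2, 3], [4, 5, 6]]])
manual_scores = torch.bmm(matrix_1, matrix_2.transpose(1, 2))
print(manual_scores)  # tensor([[[ 0.,  0.], [32., 77.]]])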
def __init__(self,
             vocab: Vocabulary,
             encoder_keys: List[str],
             mask_key: str,
             pair2vec_config_file: str,
             pair2vec_model_file: str,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             similarity_function: SimilarityFunction,
             projection_feedforward: FeedForward,
             inference_encoder: Seq2SeqEncoder,
             output_feedforward: FeedForward,
             output_logit: FeedForward,
             initializer: InitializerApplicator = InitializerApplicator(),
             dropout: float = 0.5,
             pair2vec_dropout: float = 0.0,
             bidirectional_pair2vec: bool = True,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._vocab = vocab
    self.pair2vec = util.get_pair2vec(pair2vec_config_file, pair2vec_model_file)
    self._encoder_keys = encoder_keys
    self._mask_key = mask_key
    self._text_field_embedder = text_field_embedder
    self._projection_feedforward = projection_feedforward
    self._encoder = encoder

    from allennlp.modules.matrix_attention import DotProductMatrixAttention
    self._matrix_attention = DotProductMatrixAttention()

    self._inference_encoder = inference_encoder
    self._pair2vec_dropout = torch.nn.Dropout(pair2vec_dropout)
    self._bidirectional_pair2vec = bidirectional_pair2vec

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
        self.rnn_input_dropout = VariationalDropout(dropout)
    else:
        self.dropout = None
        self.rnn_input_dropout = None

    self._output_feedforward = output_feedforward
    self._output_logit = output_logit

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoder: Seq2SeqEncoder,
             similarity_function: MatrixAttention,
             projection_feedforward: FeedForward,
             inference_encoder: Seq2SeqEncoder,
             output_feedforward: FeedForward,
             output_logit: FeedForward,
             initializer: InitializerApplicator = InitializerApplicator(),
             dropout: float = 0.5,
             regularizer: Optional[RegularizerApplicator] = None) -> None:
    super().__init__(vocab, regularizer)

    self._text_field_embedder = text_field_embedder
    self._encoder = encoder

    self._matrix_attention = DotProductMatrixAttention()
    self._projection_feedforward = projection_feedforward

    self._inference_encoder = inference_encoder

    if dropout:
        self.dropout = torch.nn.Dropout(dropout)
        self.rnn_input_dropout = VariationalDropout(dropout)
    else:
        self.dropout = None
        self.rnn_input_dropout = None

    self._output_feedforward = output_feedforward
    self._output_logit = output_logit

    self._num_labels = vocab.get_vocab_size(namespace="labels")

    check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
                           "text field embedding dim", "encoder input dim")
    check_dimensions_match(encoder.get_output_dim() * 4, projection_feedforward.get_input_dim(),
                           "encoder output dim", "projection feedforward input")
    check_dimensions_match(projection_feedforward.get_output_dim(), inference_encoder.get_input_dim(),
                           "proj feedforward output dim", "inference lstm input dim")

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()

    initializer(self)
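# Hedged side note on the "* 4" dimension check above: in this ESIM-style model the
# projection feedforward consumes the enhanced representation, which concatenates the
# encoded sequence, its attended counterpart, their difference, and their element-wise
# product. A small sketch with made-up shapes:
import torch

encoded = torch.rand(2, 7, 300)    # (batch, seq_len, encoder_output_dim)
attended = torch.rand(2, 7, 300)   # aligned representation from the matrix attention
enhanced = torch.cat([encoded, attended, encoded - attended, encoded * attended], dim=-1)
print(enhanced.shape)              # torch.Size([2, 7, 1200]) == encoder_output_dim * 4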
def __init__(self,
             vocab,
             modeling_layer,
             dropout=0.2,
             mask_lstms=True,
             initializer=InitializerApplicator()):
    super(AttnPairEncoder, self).__init__(vocab)

    self._matrix_attention = DotProductMatrixAttention()
    self._modeling_layer = modeling_layer
    self.pad_idx = vocab.get_token_index(vocab._padding_token)

    d_out_model = modeling_layer.get_output_dim()
    self.output_dim = d_out_model

    self._dropout = torch.nn.Dropout(p=dropout) if dropout > 0 else lambda x: x
    self._mask_lstms = mask_lstms

    initializer(self)
def __init__(self, pooler: Seq2VecEncoder, knowledge_encoder: Seq2SeqEncoder = None):
    super().__init__()
    self.pooler = pooler
    pass_thru = PassThroughEncoder(pooler.get_input_dim())
    self.knowledge_encoder = TimeDistributed(knowledge_encoder or pass_thru)  # TimeDistributed(context_encoder)
    self.knowledge_attn = DotProductMatrixAttention()  # CosineMatrixAttention()
    # self.attn = DotProductMatrixAttention()
    self.input_dim = pooler.get_input_dim()
    self.output_dim = pooler.get_output_dim()
def __init__(
    self,
    vocab: Vocabulary,
    bert_model: Union[str, BertModel],
    embedding_dropout: float = 0.0,
    initializer: InitializerApplicator = InitializerApplicator(),
    regularizer: Optional[RegularizerApplicator] = None,
    label_smoothing: float = None,
    ignore_span_metric: bool = False,
    srl_eval_path: str = DEFAULT_SRL_EVAL_PATH,
    parser_path: str = "/home/rizwan/.allennlp/cache/elmo-allennlp_constituency_parser",
) -> None:
    super().__init__(vocab, regularizer)

    if isinstance(bert_model, str):
        self.bert_model = BertModel.from_pretrained(bert_model)
    else:
        self.bert_model = bert_model

    self.num_classes = self.vocab.get_vocab_size("labels")
    if srl_eval_path is not None:
        # For the span based evaluation, we don't want to consider labels
        # for verb, because the verb index is provided to the model.
        self.span_metric = SrlEvalScorer(srl_eval_path, ignore_classes=["V"])
    else:
        self.span_metric = None

    self.tag_projection_layer = Linear(2 * self.bert_model.config.hidden_size, self.num_classes)

    self.embedding_dropout = Dropout(p=embedding_dropout)
    self._label_smoothing = label_smoothing
    self.ignore_span_metric = ignore_span_metric

    device = 0 if torch.cuda.is_available() else -1
    self.parser = Predictor.from_path(parser_path, cuda_device=device)
    self.syntax_roberta = RobertaModel.from_pretrained('../fairseq/checkpoints_768', 'checkpoint_best.pt')
    self.syntax_roberta.eval()

    self.matrix_attention = DotProductMatrixAttention()

    initializer(self)
def __init__(self, vocab: Vocabulary, token_embedder: TokenEmbedder, num_labels: int) -> None:
    super().__init__(vocab)
    self._text_field_embedder = BasicTextFieldEmbedder({"tokens": token_embedder})
    dim = token_embedder.get_output_dim()
    self._attend_feedforward = TimeDistributed(
        FeedForward(dim, 1, 100, torch.nn.ReLU(), 0.2))
    self._matrix_attention = DotProductMatrixAttention()
    self._compare_feedforward = TimeDistributed(
        FeedForward(dim * 2, 1, 100, torch.nn.ReLU(), 0.2))
    # linear denotes "lambda x: x"
    self._aggregate_feedforward = FeedForward(200, 1, num_labels, PassThrough(), 0.0)
    self._num_labels = num_labels

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
def __init__(
    self,
    question_encoding_dim: int,
    passage_encoding_dim: int,
    passage_attention_to_span: Seq2SeqEncoder,
    passage_startend_predictor,
    question_attention_to_span: Seq2SeqEncoder,
    passage_attention_to_count: Seq2SeqEncoder,
    num_implicit_nums: int = None,
    passage_count_predictor=None,
    passage_count_hidden2logits=None,
    dropout: float = 0.0,
):
    super().__init__()

    self.num_counts = 10

    self.passage_attention_scalingvals = [1, 2, 5, 10]

    # Parameters for answer start/end prediction from PassageAttention
    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = passage_startend_predictor
    # torch.nn.Linear(self.passage_attention_to_span.get_output_dim(), 2)

    # Parameters for answer start/end prediction directly from the passage encoding
    # (direct PassageSpanAnswer from a 1-step program)
    self.oneshot_psa_startend_predictor = torch.nn.Linear(passage_encoding_dim, 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.implicit_num_embeddings = torch.nn.Parameter(
        torch.FloatTensor(num_implicit_nums, passage_encoding_dim))
    torch.nn.init.normal_(self.implicit_num_embeddings, mean=0.0, std=0.001)
    self.implicitnum_bilinear_attention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # self.filter_matrix_attention = LinearMatrixAttention(
    #     tensor_1_dim=question_encoding_dim, tensor_2_dim=passage_encoding_dim, combination="x,y,x*y"
    # )
    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    self._endpoint_span_extractor = EndpointSpanExtractor(
        input_dim=passage_encoding_dim, combination="x,y")

    # We will sum the passage-token-repr to the weighted-q-repr - to use the x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage_to_passage attention, hopefully, for each token, putting a weight
    # on date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage_to_passage attention, hopefully, for each token, putting a weight
    # on number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
tanh = Activation.by_name('tanh')()
attention = LinearAttention(
    tensor_1_dim=embedding_dim1,
    tensor_2_dim=embedding_dim2,
    combination='x,y',
    activation=tanh)
output = attention(vector, matrix)
print('Output from LinearAttention:', output)

# MatrixAttention
sequence_length1 = 10
sequence_length2 = 15

# dot product attention requires both matrices to have the same embedding dimension
matrix1 = torch.rand((batch_size, sequence_length1, embedding_dim1))
matrix2 = torch.rand((batch_size, sequence_length2, embedding_dim1))
matrix_attention = DotProductMatrixAttention()
output = matrix_attention(matrix1, matrix2)
print('Output shape of DotProductMatrixAttention:', output.shape)

# bilinear & linear attention allow inputs with different embedding dimensions
matrix1 = torch.rand((1, sequence_length1, embedding_dim1))
matrix2 = torch.rand((1, sequence_length2, embedding_dim2))
matrix_attention = BilinearMatrixAttention(
    matrix_1_dim=embedding_dim1, matrix_2_dim=embedding_dim2)
output = matrix_attention(matrix1, matrix2)
print('Output shape of BilinearMatrixAttention:', output.shape)

matrix_attention = LinearMatrixAttention(
    tensor_1_dim=embedding_dim1,
    tensor_2_dim=embedding_dim2,
    combination='x,y',
    activation=tanh)
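# Continuation sketch (assuming matrix1, matrix2, and the LinearMatrixAttention built
# above are still in scope): applying it yields, like the other MatrixAttention modules,
# a (batch, sequence_length1, sequence_length2) score matrix.
output = matrix_attention(matrix1, matrix2)
print('Output shape of LinearMatrixAttention:', output.shape)  # (1, 10, 15)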
def __init__(
    self,
    vocab: Vocabulary,
    text_field_embedder: TextFieldEmbedder,
    coverage_ff: FeedForward,
    relation_predictor: FeedForward,
    scale_relation_loss: float = 1.0,
    aggregate: str = "max",
    combination: str = "x,y",
    answer_choice_combination: Optional[str] = None,
    coverage_combination: Optional[str] = None,
    var_dropout: float = 0.0,
    use_projection: bool = False,
    ignore_spans: bool = True,
    ignore_relns: bool = False,
    ignore_ann: bool = False,
    span_extractor: Optional[SpanExtractor] = None,
    reln_ff: Optional[FeedForward] = None,
    attention: Optional[MatrixAttention] = None,
    encoder: Optional[Seq2SeqEncoder] = None,
    initializer: InitializerApplicator = InitializerApplicator()
) -> None:
    """
    :param vocab: AllenNLP Vocabulary
    :param text_field_embedder: AllenNLP TextField embedder
    :param coverage_ff: Feedforward network that computes the "Fact-Relevance" score_f,
        i.e. how well the fact "covers" the question + answer
    :param relation_predictor: Feedforward network that predicts the relation label R_j
    :param scale_relation_loss: Scalar used to scale the relation loss term, \lambda
    :param aggregate: Pooling function used to aggregate question/fact vector representations
        in the "Relation Prediction Score". Choices: max, avg, last
    :param combination: Combination string used to combine vector representations, \bigotimes
    :param answer_choice_combination: If set, use this combination string instead of combination
        for combining the answer-based and choice-based fact representations
    :param coverage_combination: If set, use this combination string instead of combination
        for combining the question-choice-based fact representation and the fact representation
    :param var_dropout: Variational dropout probability on the input embeddings
    :param use_projection: If set to true, learn a projector to map relation representations to
        a #rel-dimensional vector. Otherwise, the relation predictor should produce embeddings
        that match the #rels.
    :param ignore_spans: If set to true, don't use the span representation of the answers in the
        fact_choice_question_rep (default: true)
    :param ignore_relns: If set to true, don't use the relation labels/scores (no relation
        representations are computed or scored)
    :param ignore_ann: If set to true, ignore all auxiliary annotation, i.e. spans and relations.
        Use the entire fact to compute answer span-based representations. No loss is computed
        against the relation label. Note that latent relation representations will still be
        computed.
    :param span_extractor: SpanExtractor used to compute the answer span representation
    :param reln_ff: Feedforward used to calculate the relation prediction score
    :param attention: Attention function used
    :param encoder: Encoder used to convert a sequence of word embeddings into contextual
        (e.g. LSTM) representations
    :param initializer: Initializer used for parameters
    """
    super(SpanRelationPredFactAttModel, self).__init__(vocab)
    self._text_field_embedder = text_field_embedder
    self._coverage_ff = coverage_ff
    if attention:
        self._attention = attention
    else:
        self._attention = DotProductMatrixAttention()
    if var_dropout > 0.0:
        self._var_dropout = InputVariationalDropout(var_dropout)
    else:
        self._var_dropout = None
    self._num_relations = vocab.get_vocab_size(namespace="relation_labels")
    self._ignore_spans = ignore_spans
    self._aggregate = aggregate
    self._scale_relation_loss = scale_relation_loss
    if span_extractor is None and not ignore_spans:
        raise ConfigurationError("ignore_spans set to False but no span_extractor provided!")
    self._span_extractor = span_extractor
    self._relation_predictor = relation_predictor
    # simple projector
    if use_projection:
        self._relation_projector = torch.nn.Linear(
            self._relation_predictor.get_output_dim(), self._num_relations)
    else:
        self._relation_projector = None
    self._combination = combination
    if answer_choice_combination:
        self._answer_choice_combination = answer_choice_combination
    else:
        self._answer_choice_combination = combination
    if coverage_combination:
        self._coverage_combination = coverage_combination
    else:
        self._coverage_combination = combination
    self._ignore_ann = ignore_ann
    self._ignore_relns = ignore_relns
    if reln_ff is None and not ignore_relns:
        raise ConfigurationError("ignore_relns set to False but no reln_ff provided!")
    self._reln_ff = reln_ff
    self._encoder = encoder
    self._aggr_label_accuracy = BooleanAccuracy()
    self._aggr_choice_accuracy = CategoricalAccuracy()
    self._relation_loss = torch.nn.BCEWithLogitsLoss()
    self._choice_loss = torch.nn.CrossEntropyLoss()
    initializer(self)
def train_valid_base_text_model(model_name):
    """
    :param model_name: the full model name to use
    :return:
    """
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()}

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

    reader = TextExpDataSetReader(token_indexers=token_indexer,
                                  tokenizer=tokenizer,
                                  add_numeric_data=False)
    train_instances = reader.read(train_data_file_path)
    validation_instances = reader.read(validation_data_file_path)
    vocab = Vocabulary()

    # TODO: change this if necessary
    # batch_size should be 10 or 9, depending on the input,
    # and do not shuffle, so all the data of the same pair will be in the same batch
    iterator = BasicIterator(batch_size=batch_size)  # , instances_per_epoch=10)
    # sorting_keys=[('sequence_review', 'list_num_tokens')])
    iterator.index_with(vocab)

    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                   'elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                  'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # TODO: check the output of this
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=2)
    # word_embeddings = elmo_embedder
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    review_attention_layer = models.AttentionSoftMaxLayer(
        BilinearMatrixAttention(word_embeddings.get_output_dim(), word_embeddings.get_output_dim()))
    seq_attention_layer = models.AttentionSoftMaxLayer(DotProductMatrixAttention())
    feed_forward = FeedForward(input_dim=batch_size, num_layers=2,
                               hidden_dims=[batch_size, 1], activations=ReLU(),
                               dropout=[0.2, 0.0])
    fc_review_rep = FeedForward(input_dim=124, num_layers=1, hidden_dims=[10], activations=ReLU())

    criterion = nn.MSELoss()
    metrics_dict = {
        'mean_absolute_error': MeanAbsoluteError(),
    }

    model = models.BasicTextModel(
        word_embedding=word_embeddings,
        review_representation_layer=review_attention_layer,
        seq_representation_layer=seq_attention_layer,
        vocab=vocab,
        criterion=criterion,
        metrics_dict=metrics_dict,
        classifier_feedforward=feed_forward,
        fc_review_rep=fc_review_rep)

    optimizer = optim.Adam(model.parameters(), lr=0.1)

    num_epochs = 2

    run_log_directory = utils.set_folder(
        datetime.now().strftime(f'{model_name}_{num_epochs}_epochs_%d_%m_%Y_%H_%M_%S'), 'logs')

    if not os.path.exists(run_log_directory):
        os.makedirs(run_log_directory)

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_instances,
        validation_dataset=validation_instances,
        num_epochs=num_epochs,
        shuffle=False,
        serialization_dir=run_log_directory,
        patience=10,
        histogram_interval=10,
    )

    model_dict = trainer.train()

    print(f'{model_name}: evaluation measures are:')
    for key, value in model_dict.items():
        print(f'{key}: {value}')
def train_valid_base_text_decision_results_ep_model(
        model_name: str, single_round_label: bool, use_only_prev_round: bool,
        train_data_file_name: str, validation_data_file_name: str, no_history: bool = False):
    """
    This function trains and validates a model that uses texts and numbers.
    :param model_name: the full model name
    :param single_round_label: the label to use: single round or total payoff
    :param use_only_prev_round: whether to use all the history or only the previous round
    :param train_data_file_name: the name of the train_data to use
    :param validation_data_file_name: the name of the validation_data to use
    :param no_history: if we don't want to use any history data
    :return:
    """
    token_indexer = {"tokens": ELMoTokenCharactersIndexer()}

    def tokenizer(x: str):
        return [w.text for w in
                SpacyWordSplitter(language='en_core_web_sm', pos_tags=False).split_words(x)]

    reader = TextExpDataSetReader(token_indexers=token_indexer,
                                  tokenizer=tokenizer,
                                  add_numeric_data=True,
                                  use_only_prev_round=use_only_prev_round,
                                  single_round_label=single_round_label,
                                  three_losses=True,
                                  no_history=no_history)

    train_data_file_inner_path = os.path.join(data_directory, train_data_file_name)
    validation_data_file_inner_path = os.path.join(data_directory, validation_data_file_name)
    train_instances = reader.read(train_data_file_inner_path)
    validation_instances = reader.read(validation_data_file_inner_path)
    vocab = Vocabulary()

    # TODO: change this if necessary
    # batch_size should be 10 or 9, depending on the input,
    # and do not shuffle, so all the data of the same pair will be in the same batch
    iterator = BasicIterator(batch_size=9)  # , instances_per_epoch=10)
    # sorting_keys=[('sequence_review', 'list_num_tokens')])
    iterator.index_with(vocab)

    options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                   'elmo_2x1024_128_2048cnn_1xhighway_options.json'
    weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/' \
                  'elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5'

    # TODO: check the output of this
    # elmo_embedder = Elmo(options_file, weight_file, num_output_representations=2)
    # word_embeddings = elmo_embedder
    elmo_embedder = ElmoTokenEmbedder(options_file, weight_file)
    word_embeddings = BasicTextFieldEmbedder({"tokens": elmo_embedder})

    review_attention_layer = models.AttentionSoftMaxLayer(
        BilinearMatrixAttention(word_embeddings.get_output_dim(), word_embeddings.get_output_dim()))
    seq_attention_layer = models.AttentionSoftMaxLayer(DotProductMatrixAttention())
    fc_review_rep_output_dim = reader.max_tokens_len
    fc_review_rep = FeedForward(input_dim=reader.max_tokens_len, num_layers=1,
                                hidden_dims=[fc_review_rep_output_dim], activations=ReLU())
    # seq_attention_layer = FeedForward(input_dim=)
    # numbers_lstm: Seq2VecEncoder = PytorchSeq2VecWrapper(nn.LSTM(2, 10, bidirectional=True, batch_first=True))

    # the shape of the flattened data rep
    feed_forward_input_dim = reader.max_seq_len * (fc_review_rep_output_dim + reader.number_length)
    feed_forward_classification = FeedForward(input_dim=feed_forward_input_dim, num_layers=1,
                                              hidden_dims=[2], activations=ReLU(), dropout=[0.0])
    feed_forward_regression = FeedForward(input_dim=feed_forward_input_dim, num_layers=1,
                                          hidden_dims=[1], activations=ReLU(), dropout=[0.0])
    criterion_classification = nn.BCEWithLogitsLoss()
    criterion_regression = nn.MSELoss()

    metrics_dict = {
        "accuracy": CategoricalAccuracy(),
        # 'auc': Auc(),
        # 'F1measure': F1Measure(positive_label=1),
    }

    model = models.BasicTextDecisionResultModel(
        word_embedding=word_embeddings,
        review_representation_layer=review_attention_layer,
        seq_representation_layer=seq_attention_layer,
        vocab=vocab,
        classifier_feedforward_classification=feed_forward_classification,
        classifier_feedforward_regression=feed_forward_regression,
        fc_review_rep=fc_review_rep,
        criterion_classification=criterion_classification,
        criterion_regression=criterion_regression,
        metrics_dict=metrics_dict,
        add_numbers=True,
        max_tokens_len=reader.max_tokens_len,
    )

    optimizer = optim.Adam(model.parameters(), lr=0.1)

    num_epochs = 2

    run_log_directory = utils.set_folder(
        datetime.now().strftime(f'{model_name}_{num_epochs}_epochs_%d_%m_%Y_%H_%M_%S'), 'logs')

    trainer = Trainer(
        model=model,
        optimizer=optimizer,
        iterator=iterator,
        train_dataset=train_instances,
        validation_dataset=validation_instances,
        num_epochs=num_epochs,
        shuffle=False,
        serialization_dir=run_log_directory,
        patience=10,
        histogram_interval=10,
    )

    model_dict = trainer.train()

    print(f'{model_name}: evaluation measures are:')
    for key, value in model_dict.items():
        if 'accuracy' in key:
            value = value * 100
        print(f'{key}: {value}')

    # save the model predictions
    model.predictions.to_csv(os.path.join(run_log_directory, 'predictions.csv'))
def __init__(self,
             question_encoding_dim: int,
             passage_encoding_dim: int,
             passage_attention_to_span: Seq2SeqEncoder,
             question_attention_to_span: Seq2SeqEncoder,
             passage_attention_to_count: Seq2SeqEncoder,
             passage_count_predictor=None,
             passage_count_hidden2logits=None,
             dropout: float = 0.0):
    super().__init__()

    self.num_counts = 10

    self.passage_attention_scalingvals = [1, 2, 5, 10]

    self.passage_attention_to_span = passage_attention_to_span
    self.passage_startend_predictor = torch.nn.Linear(
        self.passage_attention_to_span.get_output_dim(), 2)

    self.question_attention_to_span = question_attention_to_span
    self.question_startend_predictor = torch.nn.Linear(
        self.question_attention_to_span.get_output_dim(), 2)

    self.passage_attention_to_count = passage_attention_to_count
    # self.passage_count_predictor = torch.nn.Linear(self.passage_attention_to_count.get_output_dim(),
    #                                                self.num_counts)
    self.passage_count_predictor = passage_count_predictor
    # Linear from self.passage_attention_to_count.output_dim --> 1
    self.passage_count_hidden2logits = passage_count_hidden2logits

    self.dotprod_matrix_attn = DotProductMatrixAttention()

    self.filter_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=question_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # We will sum the passage-token-repr to the weighted-q-repr - to use the x*y combination
    self.relocate_matrix_attention = LinearMatrixAttention(
        tensor_1_dim=passage_encoding_dim,
        tensor_2_dim=passage_encoding_dim,
        combination="x,y,x*y")

    # This computes a passage_to_passage attention, hopefully, for each token, putting a weight
    # on date tokens that are related to it.
    self.passage_to_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_start_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)
    self.passage_to_end_date_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    # This computes a passage_to_passage attention, hopefully, for each token, putting a weight
    # on number tokens that are related to it.
    self.passage_to_num_attention: MatrixAttention = BilinearMatrixAttention(
        matrix_1_dim=passage_encoding_dim, matrix_2_dim=passage_encoding_dim)

    if dropout > 0:
        self._dropout = torch.nn.Dropout(p=dropout)
    else:
        self._dropout = lambda x: x
def __init__(self, vocab: Vocabulary,
             text_encoder: Seq2SeqEncoder,
             word_embedder: TextFieldEmbedder,
             enable_training_log: bool = False,
             inp_drop_rate: float = 0.2,
             out_drop_rate: float = 0.2,
             loss_weights: List = (0.2, 0.4, 0.4),
             super_mode: str = 'before',
             backbone: str = 'unet',
             unet_down_channel: int = 256,
             feature_sel: int = 127):
    super(UnifiedFollowUp, self).__init__(vocab)
    self.text_encoder = text_encoder
    self.word_embedder = word_embedder

    """
    Define model arch choices
    """
    self.backbone = backbone

    # input dropout
    if inp_drop_rate > 0:
        self.var_inp_dropout = InputVariationalDropout(p=inp_drop_rate)
    else:
        self.var_inp_dropout = lambda x: x
    # output dropout
    if out_drop_rate > 0:
        self.var_out_dropout = InputVariationalDropout(p=out_drop_rate)
    else:
        self.var_out_dropout = lambda x: x

    self.hidden_size = text_encoder.get_output_dim() // 2 if text_encoder.is_bidirectional() \
        else text_encoder.get_output_dim()
    self.output_size = text_encoder.get_output_dim()

    # ele -> element-wise multiply
    # dot -> dot product
    # cos -> cosine similarity
    # emb_dot -> embedding dot product
    # emb_cos -> embedding cosine similarity
    # linear -> linear similarity
    # bilinear -> bilinear similarity
    feature_sel = feature_sel
    sel_arr = "{0:07b}".format(int(feature_sel))
    nni_choices = ['ele', 'dot', 'cos', 'emb_dot', 'emb_cos', 'linear', 'bilinear']

    self.segment_choices = [nni_choices[i] for i in range(7) if sel_arr[i] == '1']
    # if expand bi-direction, we will regard forward/backward as two channels
    self.expand_bidir = False

    self.similar_function = ModuleDict({
        'ele': ElementWiseMatrixAttention(),
        'dot': DotProductMatrixAttention(),
        'cos': CosineMatrixAttention(),
        'emb_dot': DotProductMatrixAttention(),
        'emb_cos': CosineMatrixAttention(),
        'bilinear': BilinearMatrixAttention(matrix_1_dim=self.output_size,
                                            matrix_2_dim=self.output_size),
        'linear': LinearMatrixAttention(tensor_1_dim=self.output_size,
                                        tensor_2_dim=self.output_size)
    })

    self.attn_channel = 0
    for choice in self.segment_choices:
        if choice == 'ele':
            self.attn_channel += self.output_size
        elif choice in ['dot', 'cos', 'emb_dot', 'emb_cos', 'bilinear', 'linear']:
            if self.expand_bidir:
                self.attn_channel += 2
            else:
                self.attn_channel += 1

    self.class_mapping: Dict[str, int] = get_class_mapping(super_mode=super_mode)

    # Here we have two choices now, one is MLP, and another is UNet
    if self.backbone == 'unet':
        self.segmentation_net = AttentionUNet(input_channels=self.attn_channel,
                                              class_number=len(self.class_mapping.keys()),
                                              down_channel=unet_down_channel)
    else:
        raise Exception("Currently we do not support other arches.")

    class_zero_weight = loss_weights[0]
    class_one_weight = loss_weights[1]

    self.register_buffer('weight_tensor',
                         torch.tensor([class_zero_weight, class_one_weight,
                                       1 - class_zero_weight - class_one_weight]))
    self.loss = nn.CrossEntropyLoss(ignore_index=-1, weight=self.weight_tensor)

    # initialize metrics measurement
    self.metrics = {'ROUGE': BatchAverage(),
                    '_ROUGE1': BatchAverage(),
                    '_ROUGE2': BatchAverage(),
                    # TODO: You can speed up the code by disabling BLEU since
                    #  the corpus-based BLEU metric is quite time-consuming.
                    'BLEU': CorpusBLEUMetric(),
                    'EM': BatchAverage(),
                    'F1': FScoreMetric(prefix="1"),
                    'F2': FScoreMetric(prefix="2"),
                    'F3': FScoreMetric(prefix="3")}

    parameter_num = count_parameters(self)
    print(parameter_num)

    self.min_width = 8
    self.min_height = 8
    self.enable_training_log = enable_training_log
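# Hedged side note (standalone sketch, reusing only the names above; selected_choices is
# a throwaway helper, not part of the model): feature_sel is a 7-bit mask over the
# similarity choices, so 127 (0b1111111) enables all of them, while e.g. 80 (0b1010000)
# would keep only 'ele' and 'cos'.
nni_choices = ['ele', 'dot', 'cos', 'emb_dot', 'emb_cos', 'linear', 'bilinear']

def selected_choices(feature_sel: int):
    sel_arr = "{0:07b}".format(int(feature_sel))
    return [nni_choices[i] for i in range(7) if sel_arr[i] == '1']

print(selected_choices(127))  # all seven choices
print(selected_choices(80))   # ['ele', 'cos']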
def __init__(
    self,
    vocab: Vocabulary,
    span_encoder: Seq2SeqEncoder,
    reasoning_encoder: Seq2SeqEncoder,
    input_dropout: float = 0.1,
    hidden_dim_maxpool: int = 512,
    class_embs: bool = True,
    reasoning_use_obj: bool = True,
    reasoning_use_answer: bool = True,
    reasoning_use_question: bool = True,
    pool_reasoning: bool = True,
    pool_answer: bool = True,
    pool_question: bool = False,
    preload_path: str = "source_model.th",
    initializer: InitializerApplicator = InitializerApplicator(),
):
    super(AttentionQA, self).__init__(vocab)

    self.detector = SimpleDetector(pretrained=True, average_pool=True,
                                   semantic=class_embs, final_dim=512)
    ###################################################################################################

    self.rnn_input_dropout = TimeDistributed(
        InputVariationalDropout(input_dropout)) if input_dropout > 0 else None

    self.span_encoder = TimeDistributed(span_encoder)
    self.reasoning_encoder = TimeDistributed(reasoning_encoder)
    self.BiLSTM = TimeDistributed(MYLSTM(1280, 512, 256))
    self.source_encoder = TimeDistributed(source_LSTM(768, 256))

    self.span_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )
    self.span_attention_2 = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=span_encoder.get_output_dim(),
    )

    self.obj_attention = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )
    self.obj_attention_2 = BilinearMatrixAttention(
        matrix_1_dim=span_encoder.get_output_dim(),
        matrix_2_dim=self.detector.final_dim,
    )

    self._matrix_attention = DotProductMatrixAttention()
    # self._matrix_attention = MatrixAttention(similarity_function)

    self.reasoning_use_obj = reasoning_use_obj
    self.reasoning_use_answer = reasoning_use_answer
    self.reasoning_use_question = reasoning_use_question
    self.pool_reasoning = pool_reasoning
    self.pool_answer = pool_answer
    self.pool_question = pool_question

    dim = sum([
        d for d, to_pool in [
            (reasoning_encoder.get_output_dim(), self.pool_reasoning),
            (span_encoder.get_output_dim(), self.pool_answer),
            (span_encoder.get_output_dim(), self.pool_question),
        ] if to_pool
    ])

    self.final_mlp = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )
    self.final_mlp_2 = torch.nn.Sequential(
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(dim, hidden_dim_maxpool),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
        torch.nn.Linear(hidden_dim_maxpool, 1),
    )

    self.answer_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.question_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.source_answer_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.source_question_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.image_BN = BatchNorm1d(512)
    self.final_BN = torch.nn.Sequential(BatchNorm1d(512))
    self.final_mlp_linear = torch.nn.Sequential(torch.nn.Linear(512, 1))
    self.final_mlp_pool = torch.nn.Sequential(
        torch.nn.Linear(2560, 512),
        torch.nn.ReLU(inplace=True),
        torch.nn.Dropout(input_dropout, inplace=False),
    )

    self._accuracy = CategoricalAccuracy()
    self._loss = torch.nn.CrossEntropyLoss()
    initializer(self)

    if preload_path is not None:
        logger.info("Preloading!")
        preload = torch.load(preload_path)
        own_state = self.state_dict()
        for name, param in preload.items():
            # if name[0:8] == "_encoder":
            #     suffix = "._module." + name[9:]
            #     logger.info("preload parameter {}".format("span_encoder" + suffix))
            #     own_state["span_encoder" + suffix].copy_(param)
            # newly introduced source_encoder
            if name[0:4] == "LSTM":
                suffix = "._module" + name[4:]
                logger.info("preload parameter {}".format("source_encoder" + suffix))
                own_state["source_encoder" + suffix].copy_(param)