def __init__(self,
             sentence_encoder: Seq2VecEncoder,
             doc_encoder: Seq2VecEncoder,
             query_encoder: Seq2VecEncoder,
             use_encoded: bool = False,
             scorer: Optional[FeedForward] = None,
             sentence_attention: Optional[Attention] = None,
             document_attention: Optional[Attention] = None) -> None:
    super(Seq2VecSentenceScorer, self).__init__()
    self.sentence_encoder = sentence_encoder
    self.doc_encoder = doc_encoder
    self.query_encoder = query_encoder
    self.use_encoded = use_encoded
    self.sentence_attention = sentence_attention
    self.document_attention = document_attention

    # get the dimensions for the scorer and for sanity checking
    q_dim = self.query_encoder.get_output_dim()
    d_dim = self.doc_encoder.get_output_dim()
    input_dim = (q_dim + d_dim)
    if use_encoded:
        input_dim *= 2

    # set up the scorer
    if scorer is None:
        scorer = FeedForward(input_dim=input_dim, num_layers=1, hidden_dims=1,
                             activations=Activation.by_name('linear')(), dropout=0.)
    self.query_transformer = FeedForward(input_dim=q_dim, num_layers=1, hidden_dims=q_dim,
                                         activations=Activation.by_name('tanh')(), dropout=0.2)
    self.scorer = scorer

    # assertions to ensure our shapes match our assumptions
    assert q_dim == d_dim
    assert self.scorer.get_output_dim() == 1
    assert self.scorer.get_input_dim() == input_dim

def __init__(self,
             input_dim: int,
             hidden_dim: int,
             projection_dim: int,
             feedforward_hidden_dim: int,
             num_layers: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             residual_dropout_prob: float = 0.2,
             attention_dropout_prob: float = 0.1) -> None:
    super(StackedSelfAttentionEncoder, self).__init__()
    self._use_positional_encoding = use_positional_encoding
    self._attention_layers: List[MultiHeadSelfAttention] = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)
        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MultiHeadSelfAttention(num_heads=num_attention_heads,
                                                input_dim=hidden_dim,
                                                attention_dim=projection_dim,
                                                values_dim=projection_dim,
                                                attention_dropout_prob=attention_dropout_prob)
        self.add_module(f"self_attention_{i}", self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_output_dim())
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(residual_dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()

def __init__(self, word_embeddings: TextFieldEmbedder, bin_count: int):
    super(DRMM, self).__init__()
    self.word_embeddings = word_embeddings
    self.cosine_module = CosineMatrixAttention()
    self.bin_count = bin_count
    self.matching_classifier = FeedForward(input_dim=bin_count,
                                           num_layers=2,
                                           hidden_dims=[bin_count, 1],
                                           activations=[Activation.by_name('tanh')(),
                                                        Activation.by_name('tanh')()])
    self.query_gate = FeedForward(input_dim=self.word_embeddings.get_output_dim(),
                                  num_layers=2,
                                  hidden_dims=[self.word_embeddings.get_output_dim(), 1],
                                  activations=[Activation.by_name('tanh')(),
                                               Activation.by_name('tanh')()])
    self.query_softmax = MaskedSoftmax()

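# --- Hedged usage sketch (not from the original source): one plausible way to
# instantiate the DRMM constructor above with a standard AllenNLP text-field
# embedder. The vocabulary, embedding size, and bin count below are illustrative.
from allennlp.data.vocabulary import Vocabulary
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding

vocab = Vocabulary()  # normally built from the training data
word_embeddings = BasicTextFieldEmbedder(
    {"tokens": Embedding(num_embeddings=vocab.get_vocab_size("tokens"), embedding_dim=300)})
drmm = DRMM(word_embeddings=word_embeddings, bin_count=30)  # 30 matching-histogram bins, as an example
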
def __init__(
    self,
    input_dim,
    hidden_dim,
    projection_dim,
    feedforward_hidden_dim,
    num_layers,
    num_attention_heads,
    use_positional_encoding=True,
    dropout_prob=0.2,
):
    super(MaskedStackedSelfAttentionEncoder, self).__init__()
    self._use_positional_encoding = use_positional_encoding
    self._attention_layers = []
    self._feedfoward_layers = []
    self._layer_norm_layers = []
    self._feed_forward_layer_norm_layers = []

    feedfoward_input_dim = input_dim
    for i in range(num_layers):
        feedfoward = FeedForward(
            feedfoward_input_dim,
            activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
            hidden_dims=[feedforward_hidden_dim, hidden_dim],
            num_layers=2,
            dropout=dropout_prob,
        )
        self.add_module("feedforward_{}".format(i), feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_input_dim())
        self.add_module("feedforward_layer_norm_{}".format(i), feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        self_attention = MaskedMultiHeadSelfAttention(
            num_heads=num_attention_heads,
            input_dim=hidden_dim,
            attention_dim=projection_dim,
            values_dim=projection_dim,
        )
        self.add_module("self_attention_{}".format(i), self_attention)
        self._attention_layers.append(self_attention)

        layer_norm = LayerNorm(self_attention.get_input_dim())
        self.add_module("layer_norm_{}".format(i), layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = torch.nn.Dropout(dropout_prob)
    self._input_dim = input_dim
    self._output_dim = self._attention_layers[-1].get_output_dim()
    self._output_layer_norm = LayerNorm(self._output_dim)

def __init__(self, input_dim: int, code_dim: int):
    super().__init__()
    self._code_dim = code_dim
    self._mu_linear = FeedForward(input_dim=input_dim,
                                  num_layers=1,
                                  hidden_dims=code_dim,
                                  activations=lambda x: x)
    self._logvar_linear = FeedForward(input_dim=input_dim,
                                      num_layers=1,
                                      hidden_dims=code_dim,
                                      activations=lambda x: x)

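# --- Hedged sketch (assumption, not from the original source): the constructor
# above appears to be the `GaussianCodeGenerator` referenced later in this
# section (same `input_dim`/`code_dim` arguments). A minimal forward for such a
# module would apply the two linear FeedForwards and sample with the usual
# reparameterization trick; shapes are illustrative.
def forward(self, hidden: torch.Tensor) -> torch.Tensor:
    mu = self._mu_linear(hidden)          # (batch, code_dim)
    logvar = self._logvar_linear(hidden)  # (batch, code_dim)
    # z = mu + sigma * eps, sampled differentiably
    return mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)
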
def __init__(
    self,
    input_dim: int,
    hidden_dim: int,
    attention_projection_dim: int,
    feedforward_hidden_dim: int,
    num_convs: int,
    conv_kernel_size: int,
    num_attention_heads: int,
    use_positional_encoding: bool = True,
    dropout_prob: float = 0.1,
    layer_dropout_undecayed_prob: float = 0.1,
    attention_dropout_prob: float = 0,
) -> None:
    super().__init__()

    check_dimensions_match(input_dim, hidden_dim, "input_dim", "hidden_dim")

    self._use_positional_encoding = use_positional_encoding
    self._conv_norm_layers = torch.nn.ModuleList(
        [LayerNorm(hidden_dim) for _ in range(num_convs)]
    )
    self._conv_layers = torch.nn.ModuleList()
    for _ in range(num_convs):
        padding = torch.nn.ConstantPad1d(
            (conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0
        )
        depthwise_conv = torch.nn.Conv1d(
            hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim
        )
        pointwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, 1)
        self._conv_layers.append(
            torch.nn.Sequential(
                padding, depthwise_conv, pointwise_conv, Activation.by_name("relu")()
            )
        )

    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.attention_layer = MultiHeadSelfAttention(
        num_heads=num_attention_heads,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob,
    )
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name("relu")(), Activation.by_name("linear")()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob,
    )
    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)
    self._input_dim = input_dim
    self._output_dim = hidden_dim

def __init__(self, input_dim: int, summary_dim: int, feedforward: FeedForward):
    super().__init__()
    self.input_dim = input_dim
    self.summary_dim = summary_dim
    self.feedforward = feedforward
    # Make sure that the input dimension matches the input/stack.
    assert input_dim + summary_dim == feedforward.get_input_dim()

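# --- Hedged construction example (names and dims illustrative, not from the
# original source): the assertion above requires the wrapped FeedForward to
# take an input of size input_dim + summary_dim, presumably the concatenation
# of the input and the summary.
from allennlp.modules import FeedForward
from allennlp.nn import Activation

input_dim, summary_dim = 100, 50
feedforward = FeedForward(input_dim=input_dim + summary_dim,
                          num_layers=1,
                          hidden_dims=1,
                          activations=Activation.by_name('linear')())
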
def __init__(self, input_dim: int, code_dim: int, kappa: int):
    super().__init__()
    self._code_dim = code_dim
    self._kappa = kappa
    self._mu_linear = FeedForward(input_dim=input_dim,
                                  num_layers=1,
                                  hidden_dims=code_dim,
                                  activations=lambda x: x / x.norm(dim=-1, keepdim=True))

def initialize_network(self, n_tags: int, sense_dim: int, rep_dim: int):
    self.n_tags = n_tags
    self._arc_tag_arg_enc = Linear(rep_dim, self.hidden_dim)
    if self.use_predicate_rep:
        self._arc_tag_pred_enc = Linear(rep_dim, self.hidden_dim)
    if self.graph_type != 2:
        self._arc_tag_sense_enc = Linear(sense_dim, self.hidden_dim)
    if self.graph_type == 1:
        self._arc_tag_tags_enc = Linear(n_tags + 1, self.hidden_dim)
    elif self.graph_type == 2:
        self._arc_tag_tags_enc = Linear(n_tags + 1, self.hidden_dim)
    else:
        self._arc_tag_tags_enc = Linear(2 * n_tags + 1, self.hidden_dim)

    if self.weight_tie:
        self.arc_tag_refiner = lambda x: x.matmul(self._arc_tag_tags_enc.weight[:, :n_tags + 1])
        if self.graph_type != 2:
            self.predicate_linear = Linear(rep_dim + n_tags + sense_dim, self.hidden_dim)
        else:
            self.predicate_linear = Linear(rep_dim + sense_dim, self.hidden_dim)
        self.predicte_refiner = lambda x: self._dropout(self.activation(self.predicate_linear(x))) \
            .matmul(self.predicate_linear.weight[:, :sense_dim])
    else:
        self.arc_tag_refiner = FeedForward(self.hidden_dim, 1, n_tags + 1,
                                           Activation.by_name("linear")(),
                                           dropout=self.dropout)
        self.predicte_refiner = FeedForward(rep_dim + n_tags + sense_dim, 2,
                                            [self.hidden_dim] + [sense_dim],
                                            [self.activation] + [Activation.by_name("linear")()],
                                            dropout=self.dropout)

def __init__(self,
             input_dim: int,
             hidden_dim: int,
             attention_projection_dim: int,
             feedforward_hidden_dim: int,
             num_convs: int,
             conv_kernel_size: int,
             num_attention_heads: int,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             layer_dropout_undecayed_prob: float = 0.1,
             attention_dropout_prob: float = 0) -> None:
    super().__init__()

    check_dimensions_match(input_dim, hidden_dim, 'input_dim', 'hidden_dim')

    self._use_positional_encoding = use_positional_encoding
    self._conv_norm_layers = torch.nn.ModuleList(
        [LayerNorm(hidden_dim) for _ in range(num_convs)])
    self._conv_layers = torch.nn.ModuleList([
        DepthwiseSeparableConv(hidden_dim, hidden_dim, conv_kernel_size, activation="relu", dim=1)
        for _ in range(num_convs)
    ])

    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.attention_layer = MemoryEfficientMultiHeadSelfAttention(
        num_heads=num_attention_heads,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob)
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name('relu')(), Activation.by_name('linear')()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob)
    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)
    self._input_dim = input_dim
    self._output_dim = hidden_dim

def __init__(self, hdim: int = 768, nlayers: int = 2, dropout_prob: float = 0.1):
    super(GCNNet, self).__init__()
    # self.gcns = nn.ModuleList([GCN(hdim, hdim, F.relu) for i in range(nlayers)])
    self._gcn_layers = []
    self._feedfoward_layers: List[FeedForward] = []
    self._layer_norm_layers: List[LayerNorm] = []
    self._feed_forward_layer_norm_layers: List[LayerNorm] = []

    feedfoward_input_dim, feedforward_hidden_dim, hidden_dim = hdim, hdim, hdim
    for i in range(nlayers):
        feedfoward = FeedForward(feedfoward_input_dim,
                                 activations=[Activation.by_name('relu')(),
                                              Activation.by_name('linear')()],
                                 hidden_dims=[feedforward_hidden_dim, hidden_dim],
                                 num_layers=2,
                                 dropout=dropout_prob)
        # Note: Please use `ModuleList` in new code. It provides better
        # support for running on multiple GPUs. We've kept `add_module` here
        # solely for backwards compatibility with existing serialized models.
        self.add_module(f"feedforward_{i}", feedfoward)
        self._feedfoward_layers.append(feedfoward)

        feedforward_layer_norm = LayerNorm(feedfoward.get_output_dim())
        self.add_module(f"feedforward_layer_norm_{i}", feedforward_layer_norm)
        self._feed_forward_layer_norm_layers.append(feedforward_layer_norm)

        gcn = GCN(hdim, hdim, F.relu)
        self.add_module(f"gcn_{i}", gcn)
        self._gcn_layers.append(gcn)

        layer_norm = LayerNorm(hdim)
        self.add_module(f"layer_norm_{i}", layer_norm)
        self._layer_norm_layers.append(layer_norm)

        feedfoward_input_dim = hidden_dim

    self.dropout = Dropout(dropout_prob)
    self._input_dim = hdim
    self._output_dim = hdim

def __init__(self,
             vocab: Vocabulary,
             title_embedder: TextFieldEmbedder,
             abstract_embedder: TextFieldEmbedder,
             dense_dim=75) -> None:
    super().__init__(vocab)
    self.title_embedder = title_embedder
    self.abstract_embedder = abstract_embedder
    self.intermediate_dim = 6
    self.n_layers = 3
    self.layer_dims = [dense_dim for i in range(self.n_layers - 1)]
    self.layer_dims.append(1)
    self.activations = [Activation.by_name("elu")(),
                        Activation.by_name("elu")(),
                        Activation.by_name("sigmoid")()]
    self.layers = FeedForward(input_dim=self.intermediate_dim,
                              num_layers=self.n_layers,
                              hidden_dims=self.layer_dims,
                              activations=self.activations)

def __init__(self, vocab, embed_dim: int, word_encoder: Seq2SeqEncoder,
             sent_encoder: Seq2SeqEncoder, word_attn: Attention, sent_attn: Attention):
    super().__init__(vocab)
    self._vocab = vocab
    self._embed = Embedding(self._vocab.get_vocab_size('tokens'), embed_dim)
    self._word_rnn = word_encoder
    self._sent_rnn = sent_encoder
    word_output_dim = self._word_rnn.get_output_dim()
    sent_output_dim = self._sent_rnn.get_output_dim()

    self._word_proj = FeedForward(word_output_dim, 1, word_output_dim, nn.Tanh())
    self._word_rand = nn.Parameter(torch.rand(word_output_dim))
    self._word_attn = word_attn

    self._sent_proj = FeedForward(sent_output_dim, 1, sent_output_dim, nn.Tanh())
    self._sent_rand = nn.Parameter(torch.rand(sent_output_dim))
    self._sent_attn = sent_attn

    self._doc_project = nn.Linear(sent_output_dim, self._vocab.get_vocab_size('labels'))
    self._crit = nn.CrossEntropyLoss()
    self._acc = CategoricalAccuracy()

def __init__(self, indexer: DocumentIndexer, embedding_matrix: torch.Tensor, dims=None):
    super(SampleEncoder, self).__init__()
    if dims is None:
        dims = default_dims
    self.dims = dims
    words_emb_size = embedding_matrix.size(1)

    self.word_embedder = nn.Embedding.from_pretrained(embedding_matrix)
    self.word_dropout = nn.Dropout(dims['dropout_input'])
    self.char_embedder = nn.Embedding(len(indexer.char_vocab), dims['char_emb_size'])
    self.case_embedder = nn.Embedding(len(indexer.case_vocab), dims['case_emb_size'])
    self.pos_embedder = nn.Embedding(len(indexer.pos_vocab), dims['pos_emb_size'])
    self.ner_embedder = nn.Embedding(len(indexer.ner_vocab), dims['ner_emb_size'])
    self.char_encoder = PytorchSeq2VecWrapper(
        nn.LSTM(dims['char_emb_size'], dims['chars_hidden'], batch_first=True, bidirectional=True))

    total_emb_size = words_emb_size + dims['case_emb_size'] + 2 * dims['chars_hidden'] \
        + dims['pos_emb_size'] + dims['ner_emb_size']
    self.encoder = PytorchSeq2SeqWrapper(
        nn.LSTM(total_emb_size, dims['hidden'], batch_first=True, bidirectional=True, num_layers=2))
    self.sent_dropout = nn.Dropout(dims['dropout_lstm'])

    self.feedforward = FeedForward(2 * dims['hidden'], 1, dims['feedforward'], activations=nn.Tanh())
    self.attention = nn.Linear(2 * dims['hidden'], dims['attention_dim'])
    self.scores = nn.Linear(dims['attention_dim'], 1)
    self.hidden2tag = nn.Linear(2 * dims['hidden'], len(indexer.relation_type_vocab))
    self.out_dropout = nn.Dropout(dims['dropout_lstm'])

def __init__(self, input_dims: List[int], num_layers: int,
             hidden_dims: Union[int, List[int]], activations='relu'):
    super(GCN_layers, self).__init__()
    if not isinstance(hidden_dims, list):
        hidden_dims = [hidden_dims] * num_layers
    # TODO: the `activations` argument is currently ignored; tanh is hard-coded for every layer.
    activations = [torch.nn.functional.tanh] * num_layers
    assert len(input_dims) == len(hidden_dims) == len(activations) == num_layers

    gcn_layers = []
    for layer_input_dim, layer_output_dim, activate in zip(input_dims, hidden_dims, activations):
        gcn_layers.append(GCN(layer_input_dim, layer_output_dim, activate))
    self.layers = nn.ModuleList(gcn_layers)

    self._output_dim = hidden_dims[-1]
    self.input_dim = input_dims[0]
    self.ln = LayerNorm(hidden_dims[0])
    self._mlp = FeedForward(hidden_dims[0], 1, hidden_dims[0], torch.nn.functional.sigmoid)

def __init__(self,
             vocab: Vocabulary,
             query_field_embedder: TextFieldEmbedder,
             doc_field_embedder: TextFieldEmbedder,
             scorer: Scorer,
             validation_metrics: Dict[str, Metric],
             temperature: float = 15.0,
             alpha: float = 0.8,
             ranking_loss: bool = False,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             idf_embedder: Optional[TextFieldEmbedder] = None,
             dropout: float = 0.) -> None:
    super(LeToRWrapper, self).__init__(vocab, regularizer)
    self.embedder = doc_field_embedder
    self.idf_embedder = idf_embedder
    self.final_scorer = FeedForward(2, 1, 1, lambda x: x)
    self.scorer = scorer
    self.initializer = initializer
    self.regularizer = regularizer

    self.metrics = copy.deepcopy(validation_metrics)
    self.metrics.update({'accuracy': CategoricalAccuracy()})
    self.training_metrics = {
        True: ['accuracy'],
        False: validation_metrics.keys()
    }

    self.temperature = temperature
    self.kd_alpha = alpha
    # self.ranking_loss = ranking_loss
    # if self.ranking_loss:
    #     self.loss = nn.MarginRankingLoss(margin=1.0)
    # else:
    self.loss = nn.CrossEntropyLoss()
    initializer(self)

def __init__(self, params: Params, vocab: Vocabulary) -> None:
    super().__init__(vocab=vocab)
    enc_hidden_dim = params.pop_int('enc_hidden_dim', 300)
    disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
    disc_num_layers = params.pop_int('disc_num_layers', 1)
    emb_dropout = params.pop_float('emb_dropout', 0.0)
    disc_dropout = params.pop_float('disc_dropout', 0.0)
    l2_weight = params.pop_float('l2_weight', 0.0)

    self.emb_dropout = nn.Dropout(emb_dropout)
    self.disc_dropout = nn.Dropout(disc_dropout)
    self._l2_weight = l2_weight

    self._token_embedder = Embedding.from_params(vocab=vocab, params=params.pop('token_embedder'))
    self._discriminator_encoder = PytorchSeq2VecWrapper(
        nn.LSTM(input_size=self._token_embedder.get_output_dim(),
                hidden_size=enc_hidden_dim,
                batch_first=True))
    self._discriminator = FeedForward(
        input_dim=4 * self._discriminator_encoder.get_output_dim(),
        hidden_dims=[disc_hidden_dim] * disc_num_layers + [self._NUM_LABELS],
        num_layers=disc_num_layers + 1,
        activations=[Activation.by_name('relu')()] * disc_num_layers
                    + [Activation.by_name('linear')()])

    # Metrics
    self._metrics = {
        'labeled': {
            'discriminator_entropy': ScalarMetric(),
            'discriminator_accuracy': CategoricalAccuracy(),
            'loss': ScalarMetric()
        }
    }

def load_decomposable_attention_elmo_softmax_model():
    NEGATIVE_PERCENTAGE = 100
    # EMBEDDING_TYPE = ""
    # LOSS_TYPE = ""  # NLL
    # LOSS_TYPE = "_nll"  # NLL
    LOSS_TYPE = "_mse"  # MSE
    # EMBEDDING_TYPE = ""
    # EMBEDDING_TYPE = "_glove"
    # EMBEDDING_TYPE = "_bert"
    EMBEDDING_TYPE = "_elmo"
    # EMBEDDING_TYPE = "_elmo_retrained"
    # EMBEDDING_TYPE = "_elmo_retrained_2"
    token_indexers = None
    if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2":
        token_indexers = {"tokens": ELMoTokenCharactersIndexer()}
    MAX_BATCH_SIZE = 0
    # MAX_BATCH_SIZE = 150  # for bert and elmo
    reader = QuestionResponseSoftmaxReader(token_indexers=token_indexers,
                                           max_batch_size=MAX_BATCH_SIZE)
    model_file = os.path.join(
        "saved_softmax_models",
        "decomposable_attention{}{}_model_{}.th".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))
    vocabulary_filepath = os.path.join(
        "saved_softmax_models",
        "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE))

    print("LOADING VOCABULARY")
    # Load vocabulary
    vocab = Vocabulary.from_files(vocabulary_filepath)

    EMBEDDING_DIM = 300
    PROJECT_DIM = 200
    DROPOUT = 0.2
    NUM_LAYERS = 2
    if EMBEDDING_TYPE == "":
        token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                                    embedding_dim=EMBEDDING_DIM,
                                    projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_glove":
        token_embedding = Embedding.from_params(vocab=vocab,
                                                params=Params({'pretrained_file': glove_embeddings_file,
                                                               'embedding_dim': EMBEDDING_DIM,
                                                               'projection_dim': PROJECT_DIM,
                                                               'trainable': False}))
    elif EMBEDDING_TYPE == "_elmo":
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5"
        options_file = os.path.join("data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json")
        weights_file = os.path.join("data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5")
        # NOTE: using Small size as medium size gave CUDA out of memory error
        # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json"
        # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"
        # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json")
        # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file, weights_file,
                                            dropout=DROPOUT, projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file, weights_file,
                                            dropout=DROPOUT, projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_elmo_retrained_2":
        options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json")
        weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5")
        token_embedding = ElmoTokenEmbedder(options_file, weights_file,
                                            dropout=DROPOUT, projection_dim=PROJECT_DIM)
    elif EMBEDDING_TYPE == "_bert":
        print("Loading bert model")
        model = BertModel.from_pretrained('bert-base-uncased')
        token_embedding = BertEmbedder(model)
        PROJECT_DIM = 768
    else:
        print("Error: Some weird Embedding type", EMBEDDING_TYPE)
        exit()

    word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})
    HIDDEN_DIM = 200
    params = Params({'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM,
                     'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT})
    attend_feedforward = FeedForward.from_params(params)
    similarity_function = DotProductSimilarity()
    params = Params({'input_dim': 2 * PROJECT_DIM, 'hidden_dims': HIDDEN_DIM,
                     'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT})
    compare_feedforward = FeedForward.from_params(params)
    params = Params({'input_dim': 2 * HIDDEN_DIM, 'hidden_dims': 1,
                     'activations': 'linear', 'num_layers': 1})
    aggregate_feedforward = FeedForward.from_params(params)
    model = DecomposableAttentionSoftmax(vocab, word_embeddings, attend_feedforward,
                                         similarity_function, compare_feedforward,
                                         aggregate_feedforward)
    print("MODEL CREATED")

    # Load model state
    with open(model_file, 'rb') as f:
        model.load_state_dict(torch.load(f, map_location='cuda:0'))
    print("MODEL LOADED!")
    if torch.cuda.is_available():
        # cuda_device = 3
        # model = model.cuda(cuda_device)
        cuda_device = -1
    else:
        cuda_device = -1

    predictor = DecomposableAttentionSoftmaxPredictor(model, dataset_reader=reader)
    return model, predictor

seq2seq_encoder = PytorchSeq2SeqWrapper(
    torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

# In[1152]:

classifier_params = Params({
    "input_dim": HIDDEN_DIM * 2,
    "num_layers": 2,
    "hidden_dims": [50, 3],
    "activations": ["sigmoid", "linear"],
    "dropout": [0.2, 0.0]
})

# In[1153]:

classifier_feedforward = FeedForward.from_params(classifier_params)

# In[1154]:

parse_label = {'word': torch.LongTensor([[1, 0, 3, 7, 2, 9, 4], [0, 0, 5, 0, 0, 0, 4]])}
embedded_parse_label = field_type2embedder['word'](parse_label)

# In[1155]:

feature_mask = util.get_text_field_mask(parse_label)

# In[1156]:

encoded_feature = encoder(embedded_parse_label, feature_mask)

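# --- Hedged equivalence sketch (an assumption about FeedForward.from_params, not
# from the original notebook): the classifier built above from `classifier_params`
# should correspond to constructing the module directly, with the string
# activation names resolved through Activation.by_name. A 2-layer MLP mapping
# 2*HIDDEN_DIM -> 50 -> 3 with per-layer dropout.
from allennlp.modules import FeedForward
from allennlp.nn import Activation

classifier_feedforward_direct = FeedForward(
    input_dim=HIDDEN_DIM * 2,
    num_layers=2,
    hidden_dims=[50, 3],
    activations=[Activation.by_name("sigmoid")(), Activation.by_name("linear")()],
    dropout=[0.2, 0.0])
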
def __init__(self, params: Params, vocab: Vocabulary) -> None:
    super().__init__(vocab=vocab)
    disc_hidden_dim = params.pop_int('disc_hidden_dim', 1200)
    disc_num_layers = params.pop_int('disc_num_layers', 1)
    code_dist_type = params.pop_choice('code_dist_type', ['gaussian', 'vmf'],
                                       default_to_first_choice=True)
    code_dim = params.pop_int('code_dim', 500)
    emb_dropout = params.pop_float('emb_dropout', 0.0)
    disc_dropout = params.pop_float('disc_dropout', 0.0)
    latent_dropout = params.pop_float('latent_dropout', 0.0)
    l2_weight = params.pop_float('l2_weight', 0.0)

    self.emb_dropout = nn.Dropout(emb_dropout)
    self.disc_dropout = nn.Dropout(disc_dropout)
    self.latent_dropout = nn.Dropout(latent_dropout)
    self._l2_weight = l2_weight

    self._token_embedder = Embedding.from_params(vocab=vocab, params=params.pop('token_embedder'))
    self._encoder = nn.Sequential(
        nn.Conv1d(in_channels=300, out_channels=300, kernel_size=5, stride=2),
        nn.Conv1d(in_channels=300, out_channels=600, kernel_size=5, stride=2),
        nn.Conv1d(in_channels=600, out_channels=500, kernel_size=5, stride=2))
    self._generator = nn.Sequential(
        nn.ConvTranspose1d(in_channels=500, out_channels=600, kernel_size=5, stride=2),
        nn.ReLU(),
        nn.ConvTranspose1d(in_channels=600, out_channels=300, kernel_size=5, stride=2),
        nn.ReLU(),
        nn.ConvTranspose1d(in_channels=300, out_channels=300, kernel_size=5, stride=2),
        nn.ReLU())
    self._generator_projector = nn.Linear(in_features=300, out_features=vocab.get_vocab_size(), bias=False)
    self._generator_projector.weight = self._token_embedder.weight

    if code_dist_type == 'vmf':
        vmf_kappa = params.pop_int('vmf_kappa', 150)
        self._code_generator = VmfCodeGenerator(input_dim=500, code_dim=code_dim, kappa=vmf_kappa)
    elif code_dist_type == 'gaussian':
        self._code_generator = GaussianCodeGenerator(input_dim=500, code_dim=code_dim)
    else:
        raise ValueError('Unknown code_dist_type')

    self._discriminator = FeedForward(
        input_dim=4 * self._code_generator.get_output_dim(),
        hidden_dims=[disc_hidden_dim] * disc_num_layers + [self._NUM_LABELS],
        num_layers=disc_num_layers + 1,
        activations=[Activation.by_name('relu')()] * disc_num_layers
                    + [Activation.by_name('linear')()],
        dropout=disc_dropout)

    self._kl_weight = 1.0
    self._discriminator_weight = params.pop_float('discriminator_weight', 0.1)
    self._gumbel_temperature = 1.0

    # Metrics
    self._metrics = {
        'generator_loss': ScalarMetric(),
        'kl_divergence': ScalarMetric(),
        'discriminator_accuracy': CategoricalAccuracy(),
        'discriminator_loss': ScalarMetric(),
        'loss': ScalarMetric()
    }

def __init__(self,
             vocab: Vocabulary,
             text_field_embedder: TextFieldEmbedder,
             encoding_in_dim: int,
             encoding_out_dim: int,
             modeling_in_dim: int,
             modeling_out_dim: int,
             dropout_prob: float = 0.1,
             initializer: InitializerApplicator = InitializerApplicator(),
             regularizer: Optional[RegularizerApplicator] = None,
             external_number: List[int] = None,
             answering_abilities: List[str] = None) -> None:
    super().__init__(vocab, regularizer)
    # print(vocab)
    if answering_abilities is None:
        self.answering_abilities = ["span_extraction", "addition_subtraction", "counting"]
    else:
        self.answering_abilities = answering_abilities

    self.W = torch.nn.Linear(768 * 2, 768)
    text_embed_dim = text_field_embedder.get_output_dim()
    self._text_field_embedder = text_field_embedder
    # self._embedding_proj_layer = torch.nn.Linear(text_embed_dim, encoding_in_dim)
    # For use in self-attention.
    if len(self.answering_abilities) > 1:
        self._answer_ability_predictor = FeedForward(
            text_embed_dim,
            activations=[Activation.by_name('relu')(inplace=True), Activation.by_name('linear')()],
            hidden_dims=[encoding_out_dim, len(self.answering_abilities)],
            num_layers=2,
            dropout=dropout_prob)

    if "span_extraction" in self.answering_abilities:
        self._span_extraction_index = self.answering_abilities.index("span_extraction")
        self._span_start_predictor = FeedForward(
            text_embed_dim,
            activations=[Activation.by_name('relu')(inplace=True), Activation.by_name('linear')()],
            hidden_dims=[encoding_out_dim, 1],
            num_layers=2,
            dropout=dropout_prob)
        self._span_end_predictor = FeedForward(
            text_embed_dim,
            activations=[Activation.by_name('relu')(inplace=True), Activation.by_name('linear')()],
            hidden_dims=[encoding_out_dim, 1],
            num_layers=2,
            dropout=dropout_prob)

    if "addition_subtraction" in self.answering_abilities:
        self._addition_subtraction_index = self.answering_abilities.index("addition_subtraction")
        self._number_sign_predictor = FeedForward(
            text_embed_dim * 2,
            activations=[Activation.by_name('relu')(inplace=True), Activation.by_name('linear')()],
            hidden_dims=[encoding_out_dim, 3],
            num_layers=2,
            dropout=dropout_prob)

    if "counting" in self.answering_abilities:
        self._counting_index = self.answering_abilities.index("counting")
        self._count_number_predictor = FeedForward(
            text_embed_dim,
            activations=[Activation.by_name('relu')(inplace=True), Activation.by_name('linear')()],
            hidden_dims=[encoding_out_dim, 10],
            num_layers=2,
            dropout=dropout_prob)

    self._drop_metrics = DropEmAndF1()
    self._dropout = torch.nn.Dropout(p=dropout_prob)
    initializer(self)

def save_top_results(process_no, start_index, end_index): print("Starting process {} with start at {} and end at {}".format( process_no, start_index, end_index)) DATA_FOLDER = "train_data" # EMBEDDING_TYPE = "" LOSS_TYPE = "" # NLL LOSS_TYPE = "_mse" # MSE # EMBEDDING_TYPE = "" # EMBEDDING_TYPE = "_glove" # EMBEDDING_TYPE = "_bert" EMBEDDING_TYPE = "_elmo" # EMBEDDING_TYPE = "_elmo_retrained" # EMBEDDING_TYPE = "_elmo_retrained_2" token_indexers = None if EMBEDDING_TYPE == "_elmo" or EMBEDDING_TYPE == "_elmo_retrained" or EMBEDDING_TYPE == "_elmo_retrained_2": token_indexers = {"tokens": ELMoTokenCharactersIndexer()} MAX_BATCH_SIZE = 0 # MAX_BATCH_SIZE = 150 # for bert and elmo # q_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_saved_questions_lexparser_sh.txt") # r_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answers_lexparser_sh.txt") # rules_file = os.path.join("squad_seq2seq_train", "rule_based_system_squad_seq2seq_train_case_sensitive_generated_answer_rules_lexparser_sh.txt") #NOTE: Squad dev test set q_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_saved_questions.txt") r_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_generated_answers.txt") rules_file = os.path.join( "squad_seq2seq_dev_moses_tokenized", "rule_based_system_squad_seq2seq_dev_test_generated_answer_rules.txt") reader = QuestionResponseSoftmaxReader(q_file, r_file, token_indexers=token_indexers, max_batch_size=MAX_BATCH_SIZE) glove_embeddings_file = os.path.join("data", "glove", "glove.840B.300d.txt") # RESULTS_DIR = "squad_seq2seq_train2" #NOTE: All other experiments # RESULTS_DIR = "squad_seq2seq_train_moses_tokenized" # make_dir_if_not_exists(RESULTS_DIR) # all_results_save_file = os.path.join(RESULTS_DIR, "squad_seq2seq_train_predictions_start_{}_end_{}.txt".format(start_index, end_index)) #NOTE: Squad dev test set RESULTS_DIR = "squad_seq2seq_dev_moses_tokenized" make_dir_if_not_exists(RESULTS_DIR) all_results_save_file = os.path.join( RESULTS_DIR, "squad_seq2seq_dev_test_predictions_start_{}_end_{}.txt".format( start_index, end_index)) with open(all_results_save_file, "w") as all_writer: print("Testing out model with", EMBEDDING_TYPE, "embeddings") print("Testing out model with", LOSS_TYPE, "loss") # for NEGATIVE_PERCENTAGE in [100,50,20,10,5,1]: for NEGATIVE_PERCENTAGE in [100]: model_file = os.path.join( "saved_softmax_models", "decomposable_attention{}{}_model_{}.th".format( LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) vocabulary_filepath = os.path.join( "saved_softmax_models", "vocabulary{}{}_{}".format(LOSS_TYPE, EMBEDDING_TYPE, NEGATIVE_PERCENTAGE)) print("LOADING VOCABULARY") # Load vocabulary vocab = Vocabulary.from_files(vocabulary_filepath) EMBEDDING_DIM = 300 PROJECT_DIM = 200 DROPOUT = 0.2 NUM_LAYERS = 2 if EMBEDDING_TYPE == "": token_embedding = Embedding( num_embeddings=vocab.get_vocab_size('tokens'), embedding_dim=EMBEDDING_DIM, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_glove": token_embedding = Embedding.from_params( vocab=vocab, params=Params({ 'pretrained_file': glove_embeddings_file, 'embedding_dim': EMBEDDING_DIM, 'projection_dim': PROJECT_DIM, 'trainable': False })) elif EMBEDDING_TYPE == "_elmo": # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_options.json" # weights_file = 
"https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x2048_256_2048cnn_1xhighway/elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5" options_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_options.json") weights_file = os.path.join( "data", "elmo", "elmo_2x2048_256_2048cnn_1xhighway_weights.hdf5") # NOTE: using Small size as medium size gave CUDA out of memory error # options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" # weights_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5" # options_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_options.json") # weights_file = os.path.join("data", "elmo", "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_elmo_retrained_2": options_file = os.path.join("data", "bilm-tf", "elmo_retrained", "options_2.json") weights_file = os.path.join("data", "bilm-tf", "elmo_retrained", "weights_2.hdf5") token_embedding = ElmoTokenEmbedder(options_file, weights_file, dropout=DROPOUT, projection_dim=PROJECT_DIM) elif EMBEDDING_TYPE == "_bert": print("Loading bert model") model = BertModel.from_pretrained('bert-base-uncased') token_embedding = BertEmbedder(model) PROJECT_DIM = 768 else: print("Error: Some weird Embedding type", EMBEDDING_TYPE) exit() word_embeddings = BasicTextFieldEmbedder( {"tokens": token_embedding}) HIDDEN_DIM = 200 params = Params({ 'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) attend_feedforward = FeedForward.from_params(params) similarity_function = DotProductSimilarity() params = Params({ 'input_dim': 2 * PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) compare_feedforward = FeedForward.from_params(params) params = Params({ 'input_dim': 2 * HIDDEN_DIM, 'hidden_dims': 1, 'activations': 'linear', 'num_layers': 1 }) aggregate_feedforward = FeedForward.from_params(params) model = DecomposableAttentionSoftmax(vocab, word_embeddings, attend_feedforward, similarity_function, compare_feedforward, aggregate_feedforward) print("MODEL CREATED") # Load model state with open(model_file, 'rb') as f: device = torch.device('cpu') model.load_state_dict(torch.load(f, map_location=device)) print("MODEL LOADED!") if torch.cuda.is_available(): # cuda_device = 3 # model = model.cuda(cuda_device) cuda_device = -1 else: cuda_device = -1 predictor = DecomposableAttentionSoftmaxPredictor( model, dataset_reader=reader) # Read test file and get predictions gold = list() predicted_labels = list() probs = list() total_time = avg_time = 0.0 print("Started Testing:", NEGATIVE_PERCENTAGE) # before working on anything just save all the questions and responses in a list all_data = list() examples_count = processed_examples_count = 0 with open(q_file, 'r') as q_reader, open(r_file, "r") as r_reader, open( rules_file, "r") as rule_reader: logger.info("Reading questions from : 
%s", q_file) logger.info("Reading responses from : %s", r_file) q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) current_qa = (q, "") current_rules_and_responses = list() for i, (response, rule) in enumerate(zip(r_reader, rule_reader)): response = response.strip() rule = rule.strip() if response and rule: # get current_answer from response a = get_answer_from_response(response) if not current_qa[1]: current_qa = (q, a) else: # verify if the a is same as the one in current_qa if a != current_qa[1]: # print("answer phrase mismatch!!", current_qa, ":::", a, ":::", response) current_qa = (current_qa[0], a) # print(current_rules_and_responses) # exit() # Add it to the current responses current_rules_and_responses.append((response, rule)) elif len(current_rules_and_responses) > 0: # Create a instance # print(current_qa) # print(current_rules_and_responses) # exit() if rule or response: print("Rule Response mismatch") print(current_qa) print(response) print(rule) print(examples_count) print(i) exit() if examples_count < start_index: examples_count += 1 q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) current_qa = (q, "") current_rules_and_responses = list() continue elif examples_count > end_index: break all_data.append( (current_qa, current_rules_and_responses)) try: q = next(q_reader).lower().strip() q = mt.tokenize(q, return_str=True, escape=False) except StopIteration: # previous one was the last question q = "" current_qa = (q, "") current_rules_and_responses = list() examples_count += 1 # if(examples_count%100 == 0): # print(examples_count) else: # Serious Bug print("Serious BUG!!") print(current_qa) print(response) print(rule) print(examples_count) print(i) exit() print("{}:\tFINISHED IO".format(process_no)) examples_count = start_index processed_examples_count = 0 for current_qa, responses_and_rules in all_data: start_time = time.time() # Tokenize and preprocess the responses preprocessed_responses = [ mt.tokenize(remove_answer_brackets(response), return_str=True, escape=False) for response, rule in responses_and_rules ] # predictions = predictor.predict(current_qa[0], [remove_answer_brackets(response) for response, rule in responses_and_rules]) predictions = predictor.predict(current_qa[0], preprocessed_responses) label_probs = predictions["label_probs"] tuples = zip(responses_and_rules, label_probs) sorted_by_score = sorted(tuples, key=lambda tup: tup[1], reverse=True) count = 0 all_writer.write("{}\n".format(current_qa[0])) all_writer.write("{}\n".format(current_qa[1])) for index, ((response, rule), label_prob) in enumerate(sorted_by_score): if index == 3: break all_writer.write("{}\t{}\t{}\t{}\n".format( response, mt.tokenize(remove_answer_brackets(response), return_str=True, escape=False), rule, label_prob)) all_writer.write("\n") all_writer.flush() end_time = time.time() processed_examples_count += 1 examples_count += 1 total_time += end_time - start_time avg_time = total_time / float(processed_examples_count) print( "{}:\ttime to write {} with {} responses is {} secs. {} avg time" .format(process_no, examples_count, len(responses_and_rules), end_time - start_time, avg_time))
def __init__(self,
             input_dim: int,
             hidden_dim: int,
             attention_projection_dim: int,
             feedforward_hidden_dim: int,
             num_convs: int,
             conv_kernel_size: int,
             num_attention_heads: int,
             num_semantic_labels: int,
             replace_zero_semantic_labels_with_per_head_labels: bool = True,
             use_positional_encoding: bool = True,
             dropout_prob: float = 0.1,
             layer_dropout_undecayed_prob: float = 0.1,
             attention_dropout_prob: float = 0,
             semantic_integration_mode: str = "projection",
             semantic_emb_dim: int = 0,
             use_semantic_views: bool = True,
             multi_head_attention_batch_computation: bool = False,
             use_separate_label_embeddings_for_q_and_k: bool = True) -> None:
    super().__init__()

    self.return_output_meta_is_supported = True

    check_dimensions_match(input_dim, hidden_dim, 'input_dim', 'hidden_dim')

    self._use_positional_encoding = use_positional_encoding
    self._replace_zero_semantic_labels_with_per_head_labels = replace_zero_semantic_labels_with_per_head_labels
    self._conv_norm_layers = torch.nn.ModuleList([LayerNorm(hidden_dim) for _ in range(num_convs)])
    self._conv_layers = torch.nn.ModuleList()

    if semantic_integration_mode not in semantic_integration_mode_supported:
        raise Exception("semantic_integration_mode must be in [{0}] but is `{1}`".format(
            ", ".join(semantic_integration_mode_supported), semantic_integration_mode))

    self._semantic_integration_mode = semantic_integration_mode
    self._use_separate_label_embeddings_for_q_and_k = use_separate_label_embeddings_for_q_and_k

    for _ in range(num_convs):
        padding = torch.nn.ConstantPad1d((conv_kernel_size // 2, (conv_kernel_size - 1) // 2), 0)
        depthwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, conv_kernel_size, groups=hidden_dim)
        pointwise_conv = torch.nn.Conv1d(hidden_dim, hidden_dim, 1)
        self._conv_layers.append(
            torch.nn.Sequential(padding, depthwise_conv, pointwise_conv, Activation.by_name("relu")()))

    self.attention_norm_layer = LayerNorm(hidden_dim)
    self.num_semantic_labels = num_semantic_labels
    self.num_attention_heads = num_attention_heads
    self.attention_layer = MultiHeadSemanticFlatConcatSelfAttention(
        num_heads=num_attention_heads,
        num_semantic_labels=num_semantic_labels,
        input_dim=hidden_dim,
        attention_dim=attention_projection_dim,
        values_dim=attention_projection_dim,
        attention_dropout_prob=attention_dropout_prob,
        semantic_integration_mode=semantic_integration_mode,
        semantic_emb_dim=semantic_emb_dim,
        use_semantic_views=use_semantic_views,
        multi_head_attention_batch_computation=multi_head_attention_batch_computation,
        use_separate_label_embeddings_for_q_and_k=use_separate_label_embeddings_for_q_and_k)
    self.feedforward_norm_layer = LayerNorm(hidden_dim)
    self.feedforward = FeedForward(
        hidden_dim,
        activations=[Activation.by_name('relu')(), Activation.by_name('linear')()],
        hidden_dims=[feedforward_hidden_dim, hidden_dim],
        num_layers=2,
        dropout=dropout_prob)
    self.dropout = Dropout(dropout_prob)
    self.residual_with_layer_dropout = ResidualWithLayerDropout(layer_dropout_undecayed_prob)
    self._input_dim = input_dim
    self._output_dim = hidden_dim

class CompressDecoder(torch.nn.Module): def __init__(self, context_dim, dec_state_dim, enc_hid_dim, text_field_embedder, aggressive_compression: int = -1, keep_threshold: float = 0.5, abs_board_file="/home/cc/exComp/board.txt", gather='mean', dropout=0.5, dropout_emb=0.2, valid_tmp_path='/scratch/cluster/jcxu/exComp', serilization_name: str = "", vocab=None, elmo: bool = False, elmo_weight: str = "elmo_2x1024_128_2048cnn_1xhighway_weights.hdf5"): super().__init__() self.use_elmo = elmo self.serilization_name = serilization_name if elmo: from allennlp.modules.elmo import Elmo, batch_to_ids from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper self.vocab = vocab options_file = "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x1024_128_2048cnn_1xhighway/elmo_2x1024_128_2048cnn_1xhighway_options.json" weight_file = elmo_weight self.elmo = Elmo(options_file, weight_file, 1, dropout=dropout_emb) # print(self.elmo.get_output_dim()) # self.word_emb_dim = text_field_embedder.get_output_dim() # self._context_layer = PytorchSeq2SeqWrapper( # torch.nn.LSTM(self.word_emb_dim + self.elmo.get_output_dim(), self.word_emb_dim, # batch_first=True, bidirectional=True)) self.word_emb_dim = self.elmo.get_output_dim() else: self._text_field_embedder = text_field_embedder self.word_emb_dim = text_field_embedder.get_output_dim() self.XEloss = torch.nn.CrossEntropyLoss(reduction='none') self.device = get_device() # self.rouge_metrics_compression = RougeStrEvaluation(name='cp', path_to_valid=valid_tmp_path, # writting_address=valid_tmp_path, # serilization_name=serilization_name) # self.rouge_metrics_compression_best_possible = RougeStrEvaluation(name='cp_ub', path_to_valid=valid_tmp_path, # writting_address=valid_tmp_path, # serilization_name=serilization_name) self.enc = EncCompression(inp_dim=self.word_emb_dim, hid_dim=enc_hid_dim, gather=gather) # TODO dropout self.aggressive_compression = aggressive_compression self.relu = torch.nn.ReLU() self.attn = NewAttention(enc_dim=self.enc.get_output_dim(), dec_dim=self.enc.get_output_dim_unit() * 2 + dec_state_dim) self.concat_size = self.enc.get_output_dim() + self.enc.get_output_dim_unit() * 2 + dec_state_dim self.valid_tmp_path = valid_tmp_path if self.aggressive_compression < 0: self.XELoss = torch.nn.CrossEntropyLoss(reduction='none', ignore_index=-1) # self.nn_lin = torch.nn.Linear(self.concat_size, self.concat_size) # self.nn_lin2 = torch.nn.Linear(self.concat_size, 2) self.ff = FeedForward(input_dim=self.concat_size, num_layers=3, hidden_dims=[self.concat_size, self.concat_size, 2], activations=[torch.nn.Tanh(), torch.nn.Tanh(), lambda x: x], dropout=dropout ) # Keep thresold # self.keep_thres = list(np.arange(start=0.2, stop=0.6, step=0.075)) self.keep_thres = [0.0, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 1.0] self.rouge_metrics_compression_dict = OrderedDict() for thres in self.keep_thres: self.rouge_metrics_compression_dict["{}".format(thres)] = RougeStrEvaluation(name='cp_{}'.format(thres), path_to_valid=valid_tmp_path, writting_address=valid_tmp_path, serilization_name=serilization_name) def encode_sent_and_span_paral(self, text, # batch, max_sent, max_word text_msk, # batch, max_sent, max_word span, # batch, max_sent_num, max_span_num, max_word sent_idx # batch size ): this_text = two_dim_index_select(text['tokens'], sent_idx) # batch, max_word from allennlp.modules.elmo import batch_to_ids if self.use_elmo: this_text_list: List = this_text.tolist() text_str_list = [] for sample in 
this_text_list: s = [self.vocab.get_token_from_index(x) for x in sample] text_str_list.append(s) character_ids = batch_to_ids(text_str_list).to(self.device) this_context = self.elmo(character_ids) # print(this_context['elmo_representations'][0].size()) this_context = this_context['elmo_representations'][0] else: this_text = {'tokens': this_text} this_context = self._text_field_embedder(this_text) num_doc, max_word, inp_dim = this_context.size() batch_size = sent_idx.size()[0] assert batch_size == num_doc # text is the original text of the selected sentence. # this_context = two_dim_index_select(context, sent_idx) # batch, max_word, hdim this_context_mask = two_dim_index_select(text_msk, sent_idx) # batch, max_word this_span = two_dim_index_select(span, sent_idx) # batch , nspan, max_word concat_rep_of_compression, \ span_msk, original_sent_rep = self.enc.forward(word_emb=this_context, word_emb_msk=this_context_mask, span=this_span) return concat_rep_of_compression, span_msk, original_sent_rep def encode_sent_and_span(self, text, text_msk, span, batch_idx, sent_idx): context = self._text_field_embedder(text) num_doc, max_sent, max_word, inp_dim = context.size() num_doc_, max_sent_, nspan = span.size()[0:-1] assert num_doc == num_doc_ assert max_sent == max_sent_ this_context = context[batch_idx, sent_idx, :, :].unsqueeze(0) this_span = span[batch_idx, sent_idx, :, :].unsqueeze(0) this_context_mask = text_msk[batch_idx, sent_idx, :].unsqueeze(0) flattened_enc, attn_dist, \ spans_rep, span_msk, score \ = self.enc.forward(word_emb=this_context, word_emb_msk=this_context_mask, span=this_span) return flattened_enc, spans_rep, span_msk # 1, hid*2 1, span num, hid 1, span num def indep_compression_judger(self, reps): # t, batch_size_, max_span_num,self.concat_size timestep, batch_size, max_span_num, dim = reps.size() score = self.ff.forward(reps) # lin_out = self.nn_lin(reps) # activated = torch.sigmoid(lin_out) # score = self.nn_lin2(activated) if random.random() < 0.005: print("score: {}".format(score[0])) return score def get_out_dim(self): return self.concat_size def forward_parallel(self, sent_decoder_states, # t, batch, hdim sent_decoder_outputs_logit, # t, batch document_rep, # batch, hdim text, # batch, max_sent, max_word text_msk, # batch, max_sent, max_word span): # batch, max_sent_num, max_span_num, max_word # Encode compression options given sent emission. # output scores, attn dist, ... 
t, batch_size, hdim = sent_decoder_states.size() t_, batch_size_ = sent_decoder_outputs_logit.size() # invalid bits are -1 batch, max_sent, max_span_num, max_word = span.size() # assert t == t_ t = min(t, t_) assert batch_size == batch == batch_size_ if self.aggressive_compression > 0: all_attn_dist = torch.zeros((t, batch_size, max_span_num)).to(self.device) all_scores = torch.ones((t, batch_size, max_span_num)).to(self.device) * -100 else: all_attn_dist = None all_scores = None all_reps = torch.zeros((t, batch_size_, max_span_num, self.concat_size), device=self.device) for timestep in range(t): dec_state = sent_decoder_states[timestep] # batch, dim logit = sent_decoder_outputs_logit[timestep] # batch # valid_mask = (logit > 0) positive_logit = self.relu(logit.float()).long() # turn -1 to 0 span_t, span_msk_t, sent_t = self.encode_sent_and_span_paral(text=text, text_msk=text_msk, span=span, sent_idx=positive_logit) # sent_t : batch, sent_dim # span_t: batch, span_num, span_dim # span_msk_t: batch, span_num [[1,1,1,0,0,0], concated_rep_high_level = torch.cat([dec_state, document_rep, sent_t], dim=1) # batch, DIM if self.aggressive_compression > 0: attn_dist, score = self.attn.forward_one_step(enc_state=span_t, dec_state=concated_rep_high_level, enc_mask=span_msk_t.float()) # attn_dist: batch, span num # score: batch, span num # concated_rep: batch, dim ==> batch, 1, dim ==> batch, max_span_num, dim expanded_concated_rep = concated_rep_high_level.unsqueeze(1).expand((batch, max_span_num, -1)) all_reps[timestep, :, :, :] = torch.cat([expanded_concated_rep, span_t], dim=2) if self.aggressive_compression > 0: all_attn_dist[timestep, :, :] = attn_dist all_scores[timestep, :, :] = score return all_attn_dist, all_scores, all_reps def comp_loss_inf_deletion(self, decoder_outputs_logit, # gold label!!!! 
# span_seq_label, # batch, max sent num span_rouge, # batch, max sent num, max compression num scores, comp_rouge_ratio, loss_thres=1 ): """ :param decoder_outputs_logit: :param span_rouge: [batch, max_sent, max_compression] :param scores: [timestep, batch, max_compression, 2] :param comp_rouge_ratio: [batch_size, max_sent, max_compression] :return: """ tim, bat = decoder_outputs_logit.size() time, batch, max_span, _ = scores.size() batch_, sent_len, max_sp = span_rouge.size() assert batch_ == batch == bat assert time == tim assert max_sp == max_span goal_rouge_label = torch.ones((tim, batch, max_span), device=self.device, dtype=torch.long, ) * (-1) weights = torch.ones((tim, batch, max_span), device=self.device, dtype=torch.float) decoder_outputs_logit_mask = (decoder_outputs_logit >= 0).unsqueeze(2).expand( (time, batch, max_span)).float().view(-1) decoder_outputs_logit = torch.nn.functional.relu(decoder_outputs_logit).long() z = torch.zeros((1), device=self.device) for tt in range(tim): decoder_outputs_logit_t = decoder_outputs_logit[tt] out = two_dim_index_select(inp=comp_rouge_ratio, index=decoder_outputs_logit_t) label = torch.gt(out, loss_thres).long() mini_mask = torch.gt(out, 0.01).float() # baseline_mask = 1 - torch.lt(torch.abs(out - 0.99), 0.01).float() # baseline will be 0 # weight = torch.max(input=-out + 0.5, other=z) + 1 # weights[tt] = mini_mask * baseline_mask weights[tt] = mini_mask goal_rouge_label[tt] = label probs = scores.view(-1, 2) goal_rouge_label = goal_rouge_label.view(-1) weights = weights.view(-1) loss = self.XELoss(input=probs, target=goal_rouge_label) loss = loss * decoder_outputs_logit_mask * weights return torch.mean(loss) def comp_loss(self, decoder_outputs_logit, # gold label!!!! scores, span_seq_label, # batch, max sent num span_rouge, # batch, max sent num, max compression num comp_rouge_ratio ): t, batch = decoder_outputs_logit.size() t_, batch_, comp_num = scores.size() b, max_sent = span_seq_label.size() # b_, max_sen, max_comp_, _ = span.size() _b, max_sent_, max_comp = span_rouge.size() assert batch == batch_ == b == _b assert max_sent_ == max_sent assert comp_num == max_comp span_seq_label = span_seq_label.long() total_loss = torch.zeros((t, b)).to(self.device) # print(decoder_outputs_logit) # print(span_seq_label) for timestep in range(t): # this is the sent idx for batch_idx in range(b): logit = decoder_outputs_logit[timestep][batch_idx] # print(logit) # decoder_outputs_logit should be the gold label for sentence emission. # if it's 0 or -1, then we skip supervision. 
if logit < 0: continue ref_rouge_score = comp_rouge_ratio[batch_idx][logit] num_of_compression = ref_rouge_score.size()[0] _supervision_label_msk = (ref_rouge_score > 0.98).float() label = torch.from_numpy(np.arange(num_of_compression)).to(self.device).long() score_t = scores[timestep][batch_idx].unsqueeze(0) # comp num score_t = score_t.expand(num_of_compression, -1) # label = span_seq_label[batch_idx][logit].unsqueeze(0) loss = self.XEloss(score_t, label) # print(loss) loss = _supervision_label_msk * loss total_loss[timestep][batch_idx] = torch.sum(loss) # sent_msk_t = two_dim_index_select(sent_mask, logit) return torch.mean(total_loss) def _dec_compression_one_step(self, predict_compression, sp_meta, word_sent: List[str], keep_threshold: List[float], context: List[List[str]] = None): full_set_len = set(range(len(word_sent))) # max_comp, _ = predict_compression.size preds = [full_set_len.copy() for _ in range(len(keep_threshold))] # Show all of the compression spans stat_compression = {} for comp_idx, comp_meta in enumerate(sp_meta): p = predict_compression[comp_idx][1] node_type, sel_idx, rouge, ratio = comp_meta if node_type != "BASELINE": selected_words = [x for idx, x in enumerate(word_sent) if idx in sel_idx] selected_words_str = "_".join(selected_words) stat_compression["{}".format(selected_words_str)] = { "prob": float("{0:.2f}".format(p)), # float("{0:.2f}".format()) "type": node_type, "rouge": float("{0:.2f}".format(rouge)), "ratio": float("{0:.2f}".format(ratio)), "sel_idx": sel_idx, "len": len(sel_idx) } stat_compression_order = OrderedDict( sorted(stat_compression.items(), key=lambda item: item[1]["prob"], reverse=True)) # Python 3 for idx, _keep_thres in enumerate(keep_threshold): history: List[str] = context[idx] his_set = set((" ".join(history)).split(" ")) for key, value in stat_compression_order.items(): p = value['prob'] sel_idx = value['sel_idx'] sel_txt = set([word_sent[x] for x in sel_idx]) if sel_txt - his_set == set(): # print("Save big!") # print("Context: {}\tCandidate: {}".format(his_set, sel_txt)) preds[idx] = preds[idx] - set(value['sel_idx']) continue if p > _keep_thres: preds[idx] = preds[idx] - set(value['sel_idx']) preds = [list(x) for x in preds] for pred in preds: pred.sort() # Visual output visual_outputs: List[str] = [] words_for_evaluation: List[str] = [] meta_keep_ratio_word = [] for idx, compression in enumerate(preds): output = [word_sent[jdx] if (jdx in compression) else '_' + word_sent[jdx] + '_' for jdx in range(len(word_sent))] visual_outputs.append(" ".join(output)) words = [word_sent[x] for x in compression] meta_keep_ratio_word.append(float(len(words) / len(word_sent))) # meta_kepp_ratio_span.append(1 - float(len(survery['type'][idx]) / len(sp_meta))) words = " ".join(words) words = easy_post_processing(words) # print(words) words_for_evaluation.append(words) d: List[List] = [] for kep_th, vis, words_eva, keep_word_ratio in zip(keep_threshold, visual_outputs, words_for_evaluation, meta_keep_ratio_word): d.append([kep_th, vis, words_eva, keep_word_ratio]) return stat_compression_order, d def decode_inf_deletion(self, sent_decoder_outputs_logit, # time, batch span_prob, # time, batch, max_comp, 2 metadata: List, span_meta: List, span_rouge, # batch, sent, max_comp keep_threshold: List[float] ): batch_size, max_sent_num, max_comp_num = span_rouge.size() t, batsz, max_comp, _ = span_prob.size() span_score = torch.nn.functional.softmax(span_prob, dim=3).cpu().numpy() timestep, batch = sent_decoder_outputs_logit.size() sent_decoder_outputs_logit = 
sent_decoder_outputs_logit.cpu().data for idx, m in enumerate(metadata): abs_s = [" ".join(s) for s in m["abs_list"]] comp_exe = CompExecutor(span_meta=span_meta[idx], sent_idxs=sent_decoder_outputs_logit[:, idx], prediction_score=span_score[:, idx, :, :], abs_str=abs_s, name=m['name'], doc_list=m["doc_list"], keep_threshold=keep_threshold, part=m['name'], ser_dir=self.valid_tmp_path, ser_fname=self.serilization_name ) # processed_words, del_record, \ # compressions, full_sents, \ bag_pred_eval = comp_exe.run() full_sents: List[List[str]] = comp_exe.full_sents # assemble full sents full_sents = [" ".join(x) for x in full_sents] # report each keep_threshold setting to console; kdx avoids shadowing the batch index idx for kdx in range(len(keep_threshold)): self.rouge_metrics_compression_dict["{}".format(keep_threshold[kdx])](pred=bag_pred_eval[kdx], ref=[abs_s], origin=full_sents )
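# The encoding, loss, and decoding code above repeatedly relies on a
# two_dim_index_select helper to pick, for each batch element, the row named
# by a per-batch index. Below is a minimal, hedged sketch of that batch-wise
# gather; the repository's actual helper may differ in signature and edge-case
# handling.
import torch

def two_dim_index_select_sketch(inp: torch.Tensor, index: torch.Tensor) -> torch.Tensor:
    # inp: (batch, n, ...); index: (batch,) long indices into dim 1 -> output: (batch, ...)
    batch_range = torch.arange(inp.size(0), device=inp.device)
    return inp[batch_range, index]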
def __init__(self, vocab: Vocabulary, text_field_embedder: TextFieldEmbedder, num_highway_layers: int, phrase_layer: Seq2SeqEncoder, matrix_attention_layer: MatrixAttention, modeling_layer: Seq2SeqEncoder, dropout_prob: float = 0.1, initializer: InitializerApplicator = InitializerApplicator(), regularizer: Optional[RegularizerApplicator] = None, answering_abilities: Iterable[str] = ("passage_span_extraction", "question_span_extraction", "addition_subtraction", "counting")) -> None: super().__init__(vocab, regularizer) # The answering abilities to include in this model self.answering_abilities = list(answering_abilities) text_embed_dim = text_field_embedder.get_output_dim() encoding_in_dim = phrase_layer.get_input_dim() encoding_out_dim = phrase_layer.get_output_dim() modeling_in_dim = modeling_layer.get_input_dim() modeling_out_dim = modeling_layer.get_output_dim() self._text_field_embedder = text_field_embedder self._embedding_proj_layer = torch.nn.Linear(text_embed_dim, encoding_in_dim) self._highway_layer = Highway(encoding_in_dim, num_highway_layers) self._encoding_proj_layer = torch.nn.Linear(encoding_in_dim, encoding_in_dim) self._phrase_layer = phrase_layer self._matrix_attention = matrix_attention_layer self._modeling_proj_layer = torch.nn.Linear(encoding_out_dim * 4, modeling_in_dim) self._modeling_layer = modeling_layer self._passage_weights_predictor = torch.nn.Linear(modeling_out_dim, 1) self._question_weights_predictor = torch.nn.Linear(encoding_out_dim, 1) if len(self.answering_abilities) > 1: self._answer_ability_predictor = FeedForward(modeling_out_dim + encoding_out_dim, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, len(self.answering_abilities)], num_layers=2, dropout=dropout_prob) if "passage_span_extraction" in self.answering_abilities: self._passage_span_extraction_index = self.answering_abilities.index("passage_span_extraction") self._passage_span_start_predictor = FeedForward(modeling_out_dim * 2, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 1], num_layers=2) self._passage_span_end_predictor = FeedForward(modeling_out_dim * 2, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 1], num_layers=2) if "question_span_extraction" in answering_abilities: self._question_span_extraction_index = self.answering_abilities.index("question_span_extraction") self._question_span_start_predictor = FeedForward(modeling_out_dim * 2, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 1], num_layers=2) self._question_span_end_predictor = FeedForward(modeling_out_dim * 2, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 1], num_layers=2) if "addition_subtraction" in answering_abilities: self._addition_subtraction_index = self.answering_abilities.index("addition_subtraction") self._number_sign_predictor = FeedForward(modeling_out_dim * 3, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 3], num_layers=2) if "counting" in answering_abilities: self._counting_index = self.answering_abilities.index("counting") self._count_number_predictor = FeedForward(modeling_out_dim, activations=[Activation.by_name('relu')(), Activation.by_name('linear')()], hidden_dims=[modeling_out_dim, 10], num_layers=2) self._drop_metrics = DropEmAndF1() self._dropout = 
torch.nn.Dropout(p=dropout_prob) initializer(self)
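# Hedged illustration (not part of the model above) of how its two-layer
# relu -> linear prediction heads are shaped with AllenNLP's FeedForward.
# modeling_out_dim below is a placeholder chosen only for this demo.
import torch
from allennlp.modules import FeedForward
from allennlp.nn import Activation

modeling_out_dim = 128  # assumed value for illustration
span_start_head = FeedForward(input_dim=modeling_out_dim * 2,
                              num_layers=2,
                              hidden_dims=[modeling_out_dim, 1],
                              activations=[Activation.by_name('relu')(),
                                           Activation.by_name('linear')()])
dummy = torch.randn(4, 30, modeling_out_dim * 2)  # (batch, passage_length, dim)
print(span_start_head(dummy).shape)               # torch.Size([4, 30, 1])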
initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) Used to initialize the model parameters. regularizer : ``RegularizerApplicator``, optional (default=``None``) If provided, will be used to calculate the regularization penalty during training. """ lstm = PytorchSeq2SeqWrapper( torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) inference = PytorchSeq2SeqWrapper( torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) # esim = PytorchSeq2SeqWrapper(torch.nn.ESIM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True)) encoder_dim = word_embeddings.get_output_dim() projection_feedforward = FeedForward(encoder_dim * 4, 1, inference.get_input_dim(), Activation.by_name("elu")()) # (batch_size, model_dim * 2 * 4) output_feedforward = FeedForward(lstm.get_output_dim() * 4, 1, 2, Activation.by_name("elu")()) output_logit = torch.nn.Linear(in_features=2, out_features=2) simfunc = BilinearSimilarity(encoder_dim, encoder_dim) model = ESIM(vocab=vocab, text_field_embedder=word_embeddings, encoder=lstm, inference_encoder=inference, similarity_function=simfunc,
model = BertModel.from_pretrained('bert-base-uncased') token_embedding = BertEmbedder(model) PROJECT_DIM = 768 else: print("Error: unsupported embedding type:", EMBEDDING_TYPE) exit() word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding}) HIDDEN_DIM = 200 params = Params({ 'input_dim': PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) attend_feedforward = FeedForward.from_params(params) similarity_function = DotProductSimilarity() params = Params({ 'input_dim': 2 * PROJECT_DIM, 'hidden_dims': HIDDEN_DIM, 'activations': 'relu', 'num_layers': NUM_LAYERS, 'dropout': DROPOUT }) compare_feedforward = FeedForward.from_params(params) params = Params({ 'input_dim': 2 * HIDDEN_DIM, 'hidden_dims': 1, 'activations': 'linear', 'num_layers': 1 })
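# A small, hedged check of the Params-driven construction used above: building
# a FeedForward from a Params dict mirrors the direct constructor call. The
# dimensions here are throwaway values, not this script's PROJECT_DIM/HIDDEN_DIM.
import torch
from allennlp.common import Params
from allennlp.modules import FeedForward

demo_params = Params({'input_dim': 16, 'hidden_dims': 8,
                      'activations': 'relu', 'num_layers': 1, 'dropout': 0.1})
ff = FeedForward.from_params(demo_params)
print(ff.get_input_dim(), ff.get_output_dim())  # 16 8
print(ff(torch.randn(2, 16)).shape)             # torch.Size([2, 8])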
def __init__( self, vocab: Vocabulary, source_embedder: TextFieldEmbedder, # just Embedding layer encoder1: Seq2SeqEncoder, # user encoder encoder2: Seq2SeqEncoder, # system encoder attention: Attention, # decoding attention max_decoding_steps: int = 200, # max timesteps of decoder beam_size: int = 3, # beam search parameter target_namespace: str = "target_tokens", # two separate vocabulary target_embedding_dim: int = None, # target word embedding dimension scheduled_sampling_ratio: float = 0., # maybe unnecessary projection_dim: int = None, # use_coverage: bool = False, # coverage penalty, optional coverage_loss_weight: float = None, domain_lambda: float = 0.5, # the penalty weight in final loss function, need to be tuned initializer: InitializerApplicator = InitializerApplicator() ) -> None: super(SPNet, self).__init__(vocab) # General variables # target_namespace: target_tokens; source_namespace: tokens; self._target_namespace = target_namespace self._start_index = self.vocab.get_token_index(START_SYMBOL, self._target_namespace) self._end_index = self.vocab.get_token_index(END_SYMBOL, self._target_namespace) self._source_unk_index = self.vocab.get_token_index(DEFAULT_OOV_TOKEN) self._target_unk_index = self.vocab.get_token_index( DEFAULT_OOV_TOKEN, self._target_namespace) self._source_vocab_size = self.vocab.get_vocab_size() self._target_vocab_size = self.vocab.get_vocab_size( self._target_namespace) # Encoder setting self._source_embedder = source_embedder self._encoder1 = encoder1 self._encoder2 = encoder2 # We assume that the 2 encoders have the same hidden state size self._encoder_output_dim = self._encoder1.get_output_dim() # Decoder setting self._target_embedding_dim = target_embedding_dim or source_embedder.get_output_dim( ) self._num_classes = self.vocab.get_vocab_size(self._target_namespace) self._target_embedder = Embedding(self._num_classes, self._target_embedding_dim) self._decoder_input_dim = self._encoder_output_dim * 2 # default as the decoder_output_dim # input projection of decoder: [context_attn, target_emb] -> [decoder_input_dim] self._input_projection_layer = Linear( self._target_embedding_dim + self._encoder_output_dim * 2, self._decoder_input_dim) self._decoder_output_dim = self._encoder_output_dim * 2 self._decoder_cell = LSTMCell(self._decoder_input_dim, self._decoder_output_dim) self._projection_dim = projection_dim or self._source_embedder.get_output_dim( ) self._output_projection_layer = Linear(self._decoder_output_dim, self._num_classes) self._p_gen_layer = Linear( self._encoder_output_dim * 2 + self._decoder_output_dim * 2 + self._decoder_input_dim, 1) self._attention = attention # coverage penalty setting self._use_coverage = use_coverage self._coverage_loss_weight = coverage_loss_weight self._eps = 1e-45 # Decoding strategy setting self._scheduled_sampling_ratio = scheduled_sampling_ratio self._max_decoding_steps = max_decoding_steps self._beam_search = BeamSearch(self._end_index, max_steps=max_decoding_steps, beam_size=beam_size) # multitasking of domain classification self._domain_penalty = domain_lambda # penalty term = 0.5 as default self._classifier_params = Params({ "input_dim": self._decoder_output_dim, "hidden_dims": [128, 7], "activations": ["relu", "linear"], "dropout": [0.2, 0.0], "num_layers": 2 }) self._domain_classifier = FeedForward.from_params( self._classifier_params) initializer(self)
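# The _p_gen_layer above suggests the usual pointer-generator mixing of copy
# and generation distributions. The sketch below is an assumption about how
# such a p_gen scalar is typically applied, not SPNet's actual forward code;
# all names here are hypothetical.
import torch

def mix_copy_and_generate(p_gen: torch.Tensor,             # (batch, 1)
                          vocab_dist: torch.Tensor,        # (batch, vocab_size), sums to 1
                          attn_dist: torch.Tensor,         # (batch, src_len), sums to 1
                          source_token_ids: torch.Tensor,  # (batch, src_len) extended-vocab ids
                          extended_vocab_size: int) -> torch.Tensor:
    # P(w) = p_gen * P_vocab(w) + (1 - p_gen) * attention mass on source copies of w
    batch, vocab_size = vocab_dist.size()
    extended = torch.zeros(batch, extended_vocab_size, device=vocab_dist.device)
    extended[:, :vocab_size] = p_gen * vocab_dist
    extended.scatter_add_(1, source_token_ids, (1 - p_gen) * attn_dist)
    return extended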